0.1 R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

1 PREPARATIONS

1.1 Set working directory

Should include all data files:

  • expanded_data.csv
  • wide_data.csv
  • taxa_sheet_all.csv
  • forest_data.csv

1.2 Load libraries

library(dplyr)
library(vegan)
library(pvclust)
library(corrplot)
library(GGally)
library(gclus)
library(rcompanion)
library(ggplot2)
library(ape)
library(adespatial)
library(ade4)
library(knitr)
library(ridgeline)
library(ggridges)
library(mgcv)
library(viridis)
library(tidyverse)
library(tidyr)
library(readxl)
library(geosphere)
library(lubridate)
library(kableExtra)
library(splines)  
library(ggpubr)
library(ggrepel)
#remotes::install_github("MikkoVihtakari/ggOceanMaps")
library(ggOceanMaps)

1.3 Load all data

# Dataset A: expanded data, one row per observed individual
# (counts from the raw data translated into rows)
data.a <- read.csv("./expanded_data.csv")

# Dataset B: wide data plus all manually extracted environmental variables;
# this becomes Dataset C when aggregated by site
data.b <- read.csv("./wide_data.csv")

# TAXA: all morphotypes and their metadata
taxa <- read.csv("./taxa_sheet_all.csv")

# Forest data (used later as garden_length_df)
garden_length_df <- read.csv("./forest_data.csv")

Explanation: Dive_Name is the code for each EX dive. Below are the correct names of each location. In some plots only the numbers will be shown, so here is the reference list:

  1. 2304_02 = Big Bend
  2. 2304_05 = Lone Knoll
  3. 2304_07 = Uliaga
  4. 2304_08 = Umnak
  5. 2306_01 = Kodiak
  6. 2306_03 = Giacomini
  7. 2306_04 = Quinn
  8. 2306_05 = Surveyor
  9. 2306_06 = Durgin
  10. 2306_07 = Deep Discover
  11. 2306_08 = Denson
  12. 2306_12 = Noyes
  13. 2306_14 = Chatham
  14. 2306_15 = Middleton
  15. 2306_18 = Gumby

2 BASIC DATA

Before starting any data manipulation, getting some data summaries and basic statistics

2.1 Summary statistics - morphotype data

Here I calculate the basics, like how many taxa, how many observations etc. This is done before aggregating the data into depth bins, as the data is still complete here and I can work with the rows. This uses the full expanded dataset (Dataset A) and the taxa sheet.

# Basic counts from the taxa sheet (TAXA) and Dataset A (expanded observations).
#How many total morphotypes?
nrow(taxa)
## [1] 164
#How many total coral morphotypes?
sum(taxa$Phylum == "Cnidaria")
## [1] 74
#How many total sponge morphotypes?
sum(taxa$Phylum == "Porifera")
## [1] 90
#How many total observations?
nrow(data.a) #counting all rows of the Dataset A
## [1] 15531
#How many coral observations?
sum(data.a$Phylum == "Cnidaria")
## [1] 7317
#How many sponge observations?
sum(data.a$Phylum == "Porifera")
## [1] 8214
#How many coral observations of species level?
# An empty Species string marks records not identified to species level
sum(data.a$Phylum == "Cnidaria" & data.a$Species != "")
## [1] 4056
#How many sponge observations of species level?
sum(data.a$Phylum == "Porifera" & data.a$Species != "")
## [1] 997
#How many (and which) morphotypes occurred in a single dive only?
# First, count for each morphotype the number of distinct dives it occurs in
morphotype_counts <- data.a %>%
  group_by(Morphotype) %>%
  summarise(Count = n_distinct(Dive.Name))
# Filter the aggregated data to include only morphotypes that occurred in a single dive
morphotypes_single_dive <- morphotype_counts %>%
  filter(Count == 1) #change this number to check morphotypes seen in exactly N dives

# Count the number of morphotypes that occurred in a single dive
num_single_dive_morphotypes <- nrow(morphotypes_single_dive)
# Print the number of morphotypes that occurred in a single dive
print(num_single_dive_morphotypes)
## [1] 103
# Frequency of this number, as a percentage of all unique morphotypes:
(100/(length(unique(data.a$Morphotype))))*num_single_dive_morphotypes
## [1] 62.80488
# Can print the morphotypes that occurred in a single dive only: 
#knitr::kable(morphotypes_single_dive, caption = "Morphotypes that occurred in a single dive location only")

Now I want to plot density (number of counts) by depth, divided by phylum. Since some depths have been visited several times, I scale the counts by the duration of each dive, which yields an effort-corrected count. This way the density is not biased when, for example, more time was spent at 700 m because two dives covered that depth. The plot also gives an overview of the highest abundances and of how the abundances change with depth.

# Find the sampling effort (time) by depth
# Convert the Start.Date / End.Date strings (format "YYYYmmddTHHMMSS") to
# POSIXct datetimes in UTC so durations can be computed
data.a <- data.a %>%
  mutate(
    Start.Time = as.POSIXct(Start.Date, format = "%Y%m%dT%H%M%S", tz = "UTC"),
    End.Time = as.POSIXct(End.Date, format = "%Y%m%dT%H%M%S", tz = "UTC")
  )

# Calculate duration for each dive, in SECONDS (units = "secs");
# the code further down divides by 3600 to obtain hours
dive_duration <- data.a %>%
  dplyr::group_by(Dive.Name) %>%
  dplyr::summarize(Duration = as.numeric(difftime(max(End.Time), min(Start.Time), units = "secs")), .groups = "drop")

# Caption corrected: Duration is in seconds, not minutes (see units = "secs" above)
knitr::kable(dive_duration, caption = "Duration by Dive site (sec)")
Duration by Dive site (sec)
Dive.Name Duration
EX2304_DIVE02 Big Bend Canyon 18020
EX2304_DIVE05 Lone Knoll Scarp 9490
EX2304_DIVE07 Uliaga Mound Take 2 23204
EX2304_DIVE08 Umnak Canyon 8921
EX2306_Dive01 Kodiak Shelf 15205
EX2306_Dive03 Giacomini Seamount 21482
EX2306_Dive04 Quinn Seamount 14921
EX2306_Dive05 Surveyor Seamount 24769
EX2306_Dive06 Durgin Guyot 23065
EX2306_Dive07 Deep Discover Dome 14308
EX2306_Dive08 Denson Seamount 22300
EX2306_Dive12 Noyes Canyon 19411
EX2306_Dive14 Chatham Seep 22549
EX2306_Dive15 Middleton Canyon 17229
EX2306_Dive18 Gumby Ridge 15838
# Standardize dive duration as a z-score (unitless, NOT seconds).
# scale() returns a one-column matrix, which misbehaves inside data frames,
# so coerce the result back to a plain numeric vector.
dive_duration <- dive_duration %>%
  mutate(scaled_time = as.numeric(scale(Duration)))

# Count observations (annotation rows) per depth, phylum and dive
df_summary <- data.a %>%
  group_by(Depth..m., Phylum, Dive.Name) %>%
  summarise(Count = n(), .groups = "drop")

# Merge dive durations with df_summary and normalise the counts by dive time,
# giving an effort-corrected count per hour of bottom time
df_corrected <- df_summary %>%
  left_join(dive_duration, by = "Dive.Name") %>% 
  mutate(Duration_hr = as.numeric(Duration) / 3600,  # Convert seconds to hours
         Effort_Corrected_Count = Count / Duration_hr) # Normalize counts by hours

# Shared phylum colour palette for both density plots
phylum_cols <- c("Cnidaria" = "#d8514e", "Porifera" = "#f2c714")

# Density of observations by depth, weighted by the effort-corrected counts
ggplot(df_corrected, aes(x = Depth..m., fill = Phylum, weight = Effort_Corrected_Count)) +
  geom_density(alpha = 0.7) +                        # semi-transparent overlap
  scale_fill_manual(values = phylum_cols) +
  coord_flip() +                                     # depth on the vertical axis
  scale_x_reverse(breaks = seq(400, 3200, 200)) +    # shallowest depth on top
  labs(x = "Depth (m)", y = "Effort-Corrected Density", fill = "Phylum") +
  theme_minimal()

# The same plot WITHOUT effort correction (raw annotation rows)
ggplot(data.a, aes(x = Depth..m., fill = Phylum)) +
  geom_density(alpha = 0.7) +
  scale_fill_manual(values = phylum_cols) +
  coord_flip() +
  scale_x_reverse(breaks = seq(400, 3200, 200)) +
  labs(x = "Depth (m)", y = "Density", fill = "Phylum") +
  theme_minimal()

# Bin the df into depth for a test on the abundances by depth - bins in 5 meter, but can be adapted.
# NOTE: seq(0, max(x), by = 5) stops at the last multiple of 5 BELOW the maximum,
# so the deepest records would fall outside the final break and become NA.
# Extending the breaks by one bin (+5) keeps every observation inside a bin.

df_corrected <- df_corrected %>%
  mutate(depth_bin = cut(Depth..m.,
                         breaks = seq(0, max(Depth..m., na.rm = TRUE) + 5, by = 5),
                         include.lowest = TRUE)) %>%
  mutate(depth_bin = as.factor(depth_bin))  

# Is there a linear effect? 
glm_model <- glm(Effort_Corrected_Count ~ Depth..m., data = df_corrected, family = quasipoisson) # need to use quasipoisson because its non-integer counts with the correction 
summary(glm_model) # No
## 
## Call:
## glm(formula = Effort_Corrected_Count ~ Depth..m., family = quasipoisson, 
##     data = df_corrected)
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6.576e-02  6.598e-02  -0.997    0.319
## Depth..m.    4.992e-05  4.067e-05   1.228    0.220
## 
## (Dispersion parameter for quasipoisson family taken to be 2.974241)
## 
##     Null deviance: 4390.0  on 2802  degrees of freedom
## Residual deviance: 4385.6  on 2801  degrees of freedom
## AIC: NA
## 
## Number of Fisher Scoring iterations: 5
# NOTE(review): as.numeric() is a no-op here because Depth..m. is already numeric,
# so glm_numeric is identical to glm_model above (the coefficients below confirm this);
# kept for the record.
glm_numeric <- glm(Effort_Corrected_Count ~ as.numeric(Depth..m.),  data = df_corrected, family = quasipoisson)
summary(glm_numeric) # No
## 
## Call:
## glm(formula = Effort_Corrected_Count ~ as.numeric(Depth..m.), 
##     family = quasipoisson, data = df_corrected)
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)
## (Intercept)           -6.576e-02  6.598e-02  -0.997    0.319
## as.numeric(Depth..m.)  4.992e-05  4.067e-05   1.228    0.220
## 
## (Dispersion parameter for quasipoisson family taken to be 2.974241)
## 
##     Null deviance: 4390.0  on 2802  degrees of freedom
## Residual deviance: 4385.6  on 2801  degrees of freedom
## AIC: NA
## 
## Number of Fisher Scoring iterations: 5
# Is there a non-linear effect? Fit a GAM with a smooth term for depth
gam_model <- gam(Effort_Corrected_Count ~ s(Depth..m.), data = df_corrected, family = quasipoisson) 
summary(gam_model) # yes, significant, meaning that there are peaks with higher abundances, but no linear decline. 
## 
## Family: quasipoisson 
## Link function: log 
## 
## Formula:
## Effort_Corrected_Count ~ s(Depth..m.)
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.12945    0.03387  -3.822 0.000135 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                edf Ref.df     F p-value    
## s(Depth..m.) 8.888  8.996 21.97  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.058   Deviance explained = 13.6%
## GCV = 1.3632  Scale est. = 2.3104    n = 2803
# Plot the fitted smooth of depth to see where the abundance peaks lie
plot(gam_model) 

# Is there an overall depth effect? Sequential deviance test on the quasipoisson GLM
anova(glm_model, test = "Chisq") # no 
## Analysis of Deviance Table
## 
## Model: quasipoisson, link: log
## 
## Response: Effort_Corrected_Count
## 
## Terms added sequentially (first to last)
## 
## 
##           Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL                       2802     4390.0         
## Depth..m.  1   4.4372      2801     4385.6   0.2219
# And if we use the non-corrected data? 
# NOTE(review): this overwrites glm_model — the effort-corrected model above is
# no longer accessible under that name after this line
glm_model <- glm(Count ~ Depth..m., data = df_summary, family = poisson)
summary(glm_model)  # Poisson - this is significant, but might be biased because of the sampling effort
## 
## Call:
## glm(formula = Count ~ Depth..m., family = poisson, data = df_summary)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  1.9405898  0.0159777  121.46   <2e-16 ***
## Depth..m.   -0.0001719  0.0000108  -15.92   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 24165  on 2802  degrees of freedom
## Residual deviance: 23902  on 2801  degrees of freedom
## AIC: 32126
## 
## Number of Fisher Scoring iterations: 6

In geom_density(), the density represents a smoothed estimate of the distribution of observations across depth. It’s similar to a histogram but smoothed into a continuous curve. The uncorrected plot could over-represent depths where more time was spent during each dive, also overlapping dives. The effort-corrected plot gives a better idea of relative abundance per unit effort, making it more comparable across depths.

The statistical tests of the depth effect on abundances show a complex situation. There is no linear trend — abundances do not decrease with depth — but there are peaks. From the GAM model there are three large peaks (this could also be because of the sites…), with the middle peak being the lowest.

Continuing with morphotypes per depth, as an overview of how many morphotypes were seen across the depth range.

# Count unique morphotypes per depth for each phylum
morpho_diversity <- data.a %>%
  group_by(Depth..m., Phylum) %>% # by each depth now, but can also be done with depth bin, replacing then Depth..m. with depth_bin
  summarise(Unique_Morphotypes = n_distinct(Morphotype), .groups = "drop")

# Rebuild df_corrected from df_summary (NOTE: this discards the depth_bin
# column added in the previous section)
df_corrected <- df_summary %>%
  left_join(dive_duration, by = "Dive.Name") %>% 
  mutate(Duration_hr = as.numeric(Duration) / 3600,  # Convert seconds to hours
         Effort_Corrected_Count = Count / Duration_hr) # Normalize counts by hours

# Step 2: Merge with df_corrected (Ensuring Consistency)
df_corrected <- df_corrected %>%
  left_join(morpho_diversity, by = c("Depth..m.", "Phylum"))  
 
df_corrected <- df_corrected %>% 
   mutate(Effort_Corrected_Div = Unique_Morphotypes / Duration_hr) # Normalize counts by hours

# NOTE(review): the top-level `weight` aesthetic is inherited by geom_smooth and
# used as case weights in the loess fit — confirm this weighting is intended
ggplot(df_corrected, aes(x = Depth..m., fill = Phylum, weight = Effort_Corrected_Count)) +
  #geom_density(alpha = 0.7) +  # Density plot with transparency
  geom_smooth(aes(y = Effort_Corrected_Div, color = Phylum), method = "loess", se = TRUE) +
  scale_fill_manual(values = c("Cnidaria" = "#d8514e", "Porifera" = "#f2c714")) +  # Custom color
  coord_flip() +
  labs(x = "Depth (m)", y = "Effort-Corrected Density", fill = "Phylum") +
  theme_minimal() +
  scale_x_reverse(breaks=seq(400, 3200, 200)) 
## `geom_smooth()` using formula = 'y ~ x'

The morphotype density shows that most morphotypes were observed at shallower depths, but with a continuous diversity below the OMZ. Curiously, when using the corrected data it seems as if corals are more abundant and more diverse compared to sponges. This could be due to the effort correction.

2.2 Summary statistics - environmental data

## Extract metadata for each dive - use dataset B
# NOTE(review): positional selection assumes columns 169:181 hold the environmental
# variables (lon, oxygen, depth, temp, aspect, ..., sed_type per the table below) —
# verify if the column layout of wide_data.csv changes
meta_summary <- data.b[,169:181] #select only the columns that have the environmental data, exclude any morphotype data
meta_summary$Group <-  data.b$Dive.Name
# summarise_all() is superseded in current dplyr (across() is the modern form),
# but it is kept here because across() would interleave the *_mean/*_sd columns
# and change the column order of the rendered table below
meta_summary <- meta_summary %>%
  group_by(Group) %>%
  summarise_all(list(~ mean(., na.rm = TRUE), ~ sd(., na.rm = TRUE))) # Compute mean and sd per dive

kable(meta_summary) %>%
  kable_styling(font_size = 5, latex_options = c("striped")) 
Group lon_mean oxygen_mean depth_mean temp_mean aspect_mean aspect_rad_mean eastness_mean northness_mean rugosity_mean slope_mean bs_mean feature_mean sed_type_mean lon_sd oxygen_sd depth_sd temp_sd aspect_sd aspect_rad_sd eastness_sd northness_sd rugosity_sd slope_sd bs_sd feature_sd sed_type_sd
2304_02 -156.5281 2.1910415 2234.8660 1.828801 99.62525 1.7387886 0.4664148 0.0157865 8.627509 37.614612 125.86228 1 2.904192 0.0009742 0.0652827 49.99673 0.0235020 70.3430048 1.2277170 0.3543062 0.8132572 6.469064 8.041973 20.900198 0 0.6875268
2304_05 -164.1699 3.1930806 2780.7085 1.610589 277.59723 4.8449856 -0.1471961 0.4206298 29.204248 26.247497 30.25000 5 2.750000 0.0009851 0.0561942 22.62232 0.0076712 94.7809943 1.6542404 0.3946594 0.8361069 11.377150 9.810891 50.651094 0 0.4472136
2304_07 -169.7784 0.9301650 653.8028 3.498381 125.76859 2.1950760 0.0641047 -0.1494093 13.853232 40.198865 76.10465 3 5.000000 0.0012845 0.0830034 70.81022 0.0606151 84.8186776 1.4803652 0.5484088 0.8219816 7.528911 5.314185 84.358706 0 0.0000000
2304_08 -168.8173 1.0963850 1460.3310 2.268787 209.17722 3.6508312 -0.4743283 -0.8509267 8.857124 15.661749 50.75000 1 1.000000 0.0007677 0.0352930 11.34952 0.0266447 13.6620204 0.2384472 0.2000582 0.1246746 2.955730 2.718459 6.510481 0 0.0000000
2306_01 -152.9192 3.4617127 3030.8534 1.561238 23.70798 0.4137823 0.3944582 0.9013083 76.258460 17.336482 84.34667 5 4.360000 0.0005010 0.0566058 40.13850 0.0061754 10.4068466 0.1816337 0.1556529 0.0908370 16.909731 4.310947 20.340011 0 1.4762992
2306_03 -146.3149 0.5129225 805.2009 3.120205 40.24966 0.7024891 0.6451784 0.7620911 50.542931 21.234674 153.23497 3 5.295082 0.0019727 0.0204957 44.00440 0.0944003 3.1251338 0.0545439 0.0423481 0.0343036 20.994250 8.112470 12.886841 0 0.4567039
2306_04 -145.1337 1.6749253 1947.3367 1.971742 125.67085 2.1933702 0.7595301 -0.5538567 61.284492 25.333441 127.41176 3 5.000000 0.0021964 0.1254428 54.19895 0.0453028 20.3086824 0.3544534 0.1563863 0.3060012 8.686628 3.112950 14.455526 0 0.0000000
2306_05 -144.3024 0.6546702 507.0881 3.778363 25.28822 0.4413627 0.3360411 0.9251788 62.130555 28.748697 30.26341 3 4.668293 0.0018735 0.0571162 72.49600 0.0618734 40.8472395 0.7129188 0.1582382 0.0789545 12.422581 6.355581 36.553062 0 0.6240440
2306_06 -141.7489 0.5309211 1102.1208 2.849579 123.98051 2.1638681 0.8124468 -0.5473260 72.130593 31.312636 157.77679 3 5.000000 0.0015295 0.0473178 68.69183 0.1059867 11.6258698 0.2029097 0.1156312 0.1648482 17.508557 7.242374 17.900177 0 0.0000000
2306_07 -140.8336 3.5516965 3224.2976 1.546710 44.42222 0.7753139 0.6924042 0.7069126 86.726845 18.455157 73.56835 3 4.424460 0.0007628 0.0446158 33.79797 0.0060338 8.3553094 0.1458277 0.0995570 0.1053120 9.216274 2.029851 11.127750 0 1.4089693
2306_08 -137.3825 0.6443063 1321.5616 2.571358 24.32856 0.4246134 0.4086943 0.9044223 67.050512 21.405161 NaN 3 5.108696 0.0017920 0.0293136 52.40034 0.0358585 7.0567024 0.1231627 0.1097229 0.0550647 35.784204 9.636119 NA 0 0.7459401
2306_12 -134.5180 1.0701339 1543.5204 2.321652 341.88219 5.9669699 -0.3109539 0.9503646 151.019993 33.012836 79.66327 1 3.714286 0.0010175 0.1196809 52.27767 0.0918113 0.6168777 0.0107665 0.0102285 0.0033603 19.277169 4.302193 11.514172 0 1.4713764
2306_14 -135.4970 0.6271747 713.7097 4.102332 192.73962 3.3639410 -0.1583710 -0.8414947 36.632457 8.602384 167.80556 4 4.631944 0.0018600 0.0231609 8.67190 0.0355006 31.9745955 0.5580620 0.4399844 0.2740161 16.161008 4.730903 16.551795 0 2.9130680
2306_15 -145.7137 2.1287497 2108.7174 1.819657 214.31090 3.7404308 -0.5635357 -0.8257710 175.601596 33.961758 150.10169 1 4.335593 0.0010582 0.0802843 57.57299 0.0196996 1.3210700 0.0230570 0.0192034 0.0127559 27.263599 4.340210 13.588830 0 1.0844859
2306_18 -147.0119 1.3704871 1795.3608 2.096349 343.45519 5.9944239 -0.2607236 0.9182708 106.413534 24.903195 150.02439 2 2.849594 0.0005074 0.1014007 37.82639 0.0442920 17.8050098 0.3107560 0.2352109 0.1839571 16.345194 7.434391 14.057877 0 0.3908910
# What are max/min values for depth, oxygen etc?

max(data.a$Depth..m.)
## [1] 3284.324
min(data.a$Depth..m.)
## [1] 380.2322
max(data.a$Oxygen.Concentration..mg.l.)
## [1] 3.67951
min(data.a$Oxygen.Concentration..mg.l.)
## [1] 0.45809
max(data.a$Salinity..psu.)
## [1] 34.66219
min(data.a$Salinity..psu.)
## [1] 34.07224
# Threshold values for OMZ: <1 ml/L, or hypoxic <0.5 ml/L - the dataset stores oxygen
# in mg/L, so convert the ml/L thresholds to mg/L to make them comparable:
# Conversion factor × 1.429 (mass of 1 ml of O2 in mg)

print(omz <- 1*1.429) # 1.429 mg/L is the OMZ threshold (1 ml/L converted)
## [1] 1.429
print(hypoxic <- 0.5*1.429) # 0.7145 mg/L is the threshold for severe hypoxia (0.5 ml/L converted)
## [1] 0.7145
# Which depths are inside the OMZ? 

within_omz <- data.a %>% filter(Oxygen.Concentration..mg.l. < omz)
min(within_omz$Depth..m.)
## [1] 380.2322
max(within_omz$Depth..m.) 
## [1] 1846.175

2.3 Correlations - environmental variables

Check whether any of the environmental variables are correlated; this is important for the analysis and can be used to exclude variables, simplifying the further analysis.

# Compute Pearson correlation matrix
# NOTE(review): positional selection assumes columns 17:22 of data.a are
# Latitude, Longitude, Oxygen, Temperature, Depth and Salinity (as the output
# below shows) — verify if the column layout changes
env.pearson <- cor(data.a[,17:22], use = "pairwise.complete.obs") 
round(env.pearson, 2) 
##                             Latitude..deg. Longitude..deg.
## Latitude..deg.                        1.00            0.63
## Longitude..deg.                       0.63            1.00
## Oxygen.Concentration..mg.l.           0.17            0.07
## Temperature..C.                      -0.43           -0.21
## Depth..m.                             0.38            0.29
## Salinity..psu.                        0.54            0.42
##                             Oxygen.Concentration..mg.l. Temperature..C.
## Latitude..deg.                                     0.17           -0.43
## Longitude..deg.                                    0.07           -0.21
## Oxygen.Concentration..mg.l.                        1.00           -0.81
## Temperature..C.                                   -0.81            1.00
## Depth..m.                                          0.92           -0.94
## Salinity..psu.                                     0.75           -0.97
##                             Depth..m. Salinity..psu.
## Latitude..deg.                   0.38           0.54
## Longitude..deg.                  0.29           0.42
## Oxygen.Concentration..mg.l.      0.92           0.75
## Temperature..C.                 -0.94          -0.97
## Depth..m.                        1.00           0.94
## Salinity..psu.                   0.94           1.00
# Pairplot with correlation coefficients
ggpairs(data.a[,17:22], 
        lower = list(continuous = wrap("smooth", color = "blue")),  # Smoothed regression lines
        upper = list(continuous = wrap("cor", size = 5)),  # Pearson correlation values
        diag = list(continuous = wrap("barDiag", fill = "lightblue"))) +  # Histogram on diagonals
  theme_bw()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Correlation plot of the same matrix, circles + coefficients on the upper triangle
corrplot(env.pearson, method = "circle", type = "upper", tl.col = "black", tl.cex = 0.8, addCoef.col = "black")

2.3.1 Temperature-Oxygen and Temperature-Salinity Plots

Figure 2.

# Figure 2: temperature against a second water-mass variable, points coloured
# by depth. A single helper keeps both panels styled identically.
plot_temp_against <- function(xvar, xlab_text) {
  ggplot(data.a, aes(y = Temperature..C., x = .data[[xvar]], color = Depth..m.)) +
    geom_point(size = 1) +
    scale_color_viridis(option = "viridis", direction = -1) +  # colourblind-friendly depth scale
    xlab(xlab_text) +
    ylab("Temperature (°C)") +
    theme_linedraw() +
    theme(panel.grid = element_blank(),
          legend.title = element_text(size = 10),
          legend.text = element_text(size = 10),
          axis.title.x = element_text(size = 14),
          axis.title.y = element_text(size = 14),
          legend.background = element_blank(),
          legend.box.background = element_rect(colour = "black"),
          legend.position = "right")
}

# Plot 1) Temperature vs Oxygen
plot_temp_against("Oxygen.Concentration..mg.l.", "Oxygen (mg/L)")

# Plot 2) Temperature vs Salinity
plot_temp_against("Salinity..psu.", "Salinity (psu)")

3 DATA MANIPULATION

3.1 Aggregate observations

The observations are binned into 5 meter depth bins to simplify and aggregate data and reduce computation time. Otherwise, some analyses take a long time. Here we will be using the Dataset B - the wide data.

# The +100 padding guarantees the deepest records still fall inside the last break
depth_bins <- seq(0, max(data.b$depth, na.rm = TRUE)+100, by = 5) # in 5 meter bins, set by `by = 5`
# NOTE(review): labels read "1-5", "6-10", ... while the actual intervals are
# (0,5], (5,10], ... — fine for whole-metre depths, slightly misleading for
# fractional ones; confirm this is acceptable
depth_labels <- paste0(depth_bins[-length(depth_bins)] + 1, "-", depth_bins[-1]) #add labels for each depth bin

# Add depth bin column to the dataframe
data.b <- data.b %>% 
  mutate(depth_bin = cut(depth, breaks = depth_bins, labels = depth_labels, include.lowest = TRUE)) 

# Aggregate species data by Dive.Name and Depth_bin
# NOTE(review): positions 3:166 are assumed to be the taxa count columns
df_species_aggregated <- data.b %>%
  group_by(Dive.Name, depth_bin) %>% #Dive.Name is the column that indicates the site/location (e.g. EX2304_07 is Uliaga seamount)
  summarise(across(3:166, sum, .names = "{.col}"), .groups = "drop") #across all columns with taxa

# Aggregate environmental variables by Dive.Name and Depth_bin
# Return the most frequent (modal) value of a vector, ignoring NAs.
# Ties are broken in favour of the value that appears first in the data.
# Returns NA when the vector holds no non-missing values.
get_mode <- function(x) {
  vals <- na.omit(x)
  if (length(vals) == 0) {
    return(NA)
  }
  candidates <- unique(vals)
  counts <- tabulate(match(vals, candidates))
  candidates[which.max(counts)]
}

# Ensure lat/lon are numeric before aggregation, otherwise the mean below errors
data.b$lat <- as.numeric(data.b$lat)
data.b$lon <- as.numeric(data.b$lon)

# Aggregate environmental data per dive and depth bin: mean for numeric columns,
# most common value (mode) for the character sediment descriptors.
# NOTE(review): positions 167:179 are assumed to be the environmental columns.
df_env_aggregated <- data.b %>%
  group_by(Dive.Name, depth_bin) %>% 
  summarise(across(167:179, ~ mean(.x, na.rm = TRUE)),  # lambda form: passing na.rm through `...` is deprecated in dplyr >= 1.1 and triggered a warning here
            sed_type = get_mode(sed_type),  # character column: keep the primary (most common) sediment type
            sed_consistency = get_mode(sed_consistency), # the same here
            .groups = "drop")

# Merge species and environmental data
data.b <- left_join(df_species_aggregated, df_env_aggregated, by = c("Dive.Name", "depth_bin")) 
#OBS for Dive 2306-08 backscatter (bs) is NA because it had no values due to technical errors during the dive

#Add the "water mass and/or current" to the dataset 
# Dives are matched by substring of Dive.Name; case_when has no default case,
# so any dive not listed here would get watermass = NA
data.b <- data.b %>%
  mutate(watermass = case_when(
    grepl("2306_04|2306_08", Dive.Name) ~ "PDW", #PDW (without LCDW)
    grepl("2306_03|2306_05|2306_06", Dive.Name) ~ "PSIW", #Gulf of Alaska Gyre (Alaska current + eddies) = PSIW water mass 
    grepl("2304_07|2304_08", Dive.Name) ~ "ANSC" , #ANSC + Samalga Pass
    grepl("2306_12|2306_14|2306_15|2306_18", Dive.Name) ~ "ACC" , #ACC Alaska coastal current, on the shelf
    grepl("2304_02|2304_05|2306_01|2306_07", Dive.Name) ~ "PDW/LCDW" #PDW/LCDW water mass - deepest locations too 
  ))

#write.xlsx(data.b, "aggregated_data.xlsx") #to check the df if needed

3.2 Extract individual columns

#Extract individual df for coordinates, species data and env data
#coord <-data.b[,167:168] #select the columns that have the lat/lon

# NOTE(review): positional indices assume lat = col 167, lon = col 168 of the
# aggregated data.b; verify the resulting column names are "lat"/"lon" since
# data.frame() flattens data-frame arguments using their inner names
coord <- data.frame(
  lat = round(data.b[, 167], 6),
  lon = round(data.b[, 168], 6))
ENV   <-data.b[,169:181] #all the env data (assumed positions 169:181)
ENV   <- ENV[,-c(4:5)] #remove aspect and aspect_rad, not needed 
SPE   <-data.b[,3:166] #all annotation/taxa data
#write.xlsx(SPE, "SPE_data.xlsx") #to check the df if needed

3.3 Create Dataset C

In some cases, a dataset is needed where all data is aggregated by site. Here, I will call this Dataset C (short dataset) in contrast to Dataset B (full dataset, rows being annotation timestamps)

#Use Dive.Name to combine/summarise the depth-binned data into one row per site
data.c <- data.b %>%
  group_by(Dive.Name) %>%
  summarise(
    across(all_of(colnames(SPE)), sum, .names = "{.col}"),  # Sum for SPE (taxa count) columns
    sed_type = get_mode(sed_type),  # character column: keep the primary (most common) sediment type
    sed_consistency = get_mode(sed_consistency), # the same here
    across(where(is.numeric) & all_of(colnames(ENV)), ~ mean(.x, na.rm = TRUE), .names = "{.col}"),  # lambda form: passing na.rm through `...` is deprecated in dplyr >= 1.1
    across(all_of(colnames(coord)), first, .names = "{.col}"),  # First value for coord columns
    .groups = "drop"
  ) %>%
  as.data.frame()

# Same water-mass mapping as applied to data.b in section 3.1; any dive not
# matched by a pattern would get watermass = NA (no default case)
data.c <- data.c %>%
  mutate(watermass = case_when(
    grepl("2306_04|2306_08", Dive.Name) ~ "PDW", #PDW (without LCDW)
    grepl("2306_03|2306_05|2306_06", Dive.Name) ~ "PSIW", #Gulf of Alaska Gyre (Alaska current + eddies) = PSIW water mass 
    grepl("2304_07|2304_08", Dive.Name) ~ "ANSC" , #ANSC + Samalga Pass
    grepl("2306_12|2306_14|2306_15|2306_18", Dive.Name) ~ "ACC" , #ACC Alaska coastal current, on the shelf
    grepl("2304_02|2304_05|2306_01|2306_07", Dive.Name) ~ "PDW/LCDW" #PDW/LCDW water mass - deepest locations too 
  ))

Now I want to create again SPE and ENV dfs from the combined dataset C

# NOTE(review): positional indices assume the env columns sit at 166:176 and the
# taxa columns at 2:165 of data.c — verify if data.c's layout changes
ENV2   <-data.c[,166:176] #all the env data
SPE2   <-data.c[,2:165] #all annotation/taxa data
#write.xlsx(SPE2, "SPE_data.xlsx") #to check the df if needed 

3.4 Create Dataset FL (Family Level)

Since there might be patterns in the family instead of morphotypes, the data is aggregated into family level.

# Merge taxa information with the column names of SPE.
# Empty strings in the taxa sheet are recoded to NA so missing
# taxonomy can be filled from the next available higher rank.
taxa[taxa == ""] <- NA

# Fill a missing Family with the next available higher rank:
# Family if present, otherwise Class, otherwise Phylum.
# coalesce() returns the first non-NA value per row — the same logic
# as the previous nested ifelse(), written idiomatically.
fill_family <- function(df) {
  df %>%
    mutate(Family = coalesce(Family, Class, Phylum))
}

taxa_filled <- fill_family(taxa)

# One row per SPE column (morphotype code)
column_names <- data.frame(Morphotype = colnames(SPE), stringsAsFactors = FALSE)

# Lookup table: morphotype -> (filled) family
taxa_mapping <- taxa_filled %>%
  dplyr::select(Morphotype, Family) %>%
  dplyr::mutate(Family = as.factor(Family))

# Merge column names with taxa information (keep every SPE column,
# even those without a taxa match)
column_info <- merge(column_names, taxa_mapping, by = "Morphotype", all.x = TRUE)

# Group the morphotype columns by their family
grouped_columns <- column_info %>%
  group_by(Family) %>%
  summarise(Morphotype = list(Morphotype), .groups = 'drop')

# Pre-allocate the family-level data frame: one column per family,
# one row per observation in SPE
SPE.fam <- as.data.frame(matrix(nrow = nrow(SPE), ncol = nrow(grouped_columns)))
colnames(SPE.fam) <- grouped_columns$Family

# Combine columns by family: sum all morphotype columns belonging to
# the same family into one family-level column.
# seq_len() is safe for an empty grouped_columns (1:nrow would give 1:0).
for (i in seq_len(nrow(grouped_columns))) {
  family_columns <- grouped_columns$Morphotype[[i]]
  # drop = FALSE keeps a single-column selection as a data frame
  selected_data <- SPE[ , family_columns, drop = FALSE]
  # Row-wise sum over the family's morphotype columns
  SPE.fam[[i]] <- rowSums(selected_data, na.rm = TRUE)
}

# print to check
# SPE.fam

4 DISTRIBUTION PATTERNS

Here the morphotype data is explored in more detail, e.g. which family is the most speciose, and so on.

# Which is the most abundant morphotype?
# Total count per morphotype column, summed over all locations (rows)
total_abundance <- colSums(SPE)
# Rank morphotypes from most to least abundant
most_abundant_taxa <- total_abundance[order(total_abundance, decreasing = TRUE)]
# Print the ranked abundance list
most_abundant_taxa
##  PD18  PD03 CAO36 CAO48  PH50 CAO18 CAH02  PD05  PH10  PH23  PD22 CAO51 CAO14 
##  1185  1184  1093   696   652   593   549   543   525   524   505   500   387 
## CAO54 CAO47   P06 CAO02  PD02 CAO16  PH12  PH51 CAO24  PH17 CAO49  PD19 CAO50 
##   376   367   350   318   314   314   299   285   285   219   204   200   196 
##  PD32 CAO10 CAO13 CAO37  PH09 CAO15 CAO43  PD16 CAO62  PH40  PH47  PH48  PD21 
##   159   152   147   146   130   123   118    88    83    82    79    75    67 
##  PH04  PH55 CAO27 CAH09  PH57  PD23 CAO19 CAO25 CAH06 CAO61  PD04 CAH10 CAO28 
##    65    62    57    54    50    50    49    48    44    37    35    33    33 
##  PD07 CAO52  PH08  PD34  PD27   P02   P09  PH53 CAO58  PD14 CAO05  PH43  PD25 
##    32    32    30    28    26    25    25    23    23    23    22    21    20 
## CAO56 CAO04  PD08 CAO33 CAH19  PH41 CAH08 CAO29  PD28  PD13 CAH11  PH39  PH49 
##    20    18    18    18    17    16    14    14    13    12    12    12    11 
## CAO55 CAO65  PH35  PH05  PH33 CAO60   P04  PH32 CAO20  PH31 CAO08  PH54 CAO30 
##    11    11    10     9     9     9     9     9     8     8     8     7     7 
## CAH18  PH46  PH07 CAO26  PH38  PD29  PH14  PH27 CAH05 CAO09  PD24   P03 CAO45 
##     7     6     6     6     6     5     5     5     4     4     4     4     4 
##  PH01  PH03 CAO06  PD31 CAO23  PH22 CAH12 CAO35 CAH16 CAO41 CAO07 CAO64  PH52 
##     3     3     3     3     3     3     3     3     3     3     2     2     2 
##  PH06 CAO11 CAO59  PH18  PH20  PH24 CAO31  PH30  PD33 CAH14 CAH15 CAO44  PH58 
##     2     2     2     2     2     2     2     2     2     2     2     2     2 
##   P01 CAO03  PH02  PD26 CAO17  PD30  PD35  PD09 CAO22 CAO57  PD06  PH15  PH16 
##     1     1     1     1     1     1     1     1     1     1     1     1     1 
##  PH19  PH21  PH25 CAO63 CAO32  PH26 CAO46  PH28  PH29   P05 CAO34 CAO66 CAO67 
##     1     1     1     1     1     1     1     1     1     1     1     1     1 
##  PD20   P07 CAO42   P08  PH36  PH37  PH42 CAO68 
##     1     1     1     1     1     1     1     1
# Which is the most abundant family?
# Total count per family column, summed over all locations (rows)
total_abundance_fam <- colSums(SPE.fam)
# Rank families from most to least abundant
most_abundant_family <- total_abundance_fam[order(total_abundance_fam, decreasing = TRUE)]
most_abundant_family
##    Cladorhizidae       Primnoidae       Corallidae        Farreidae 
##             3434             2347             1332             1188 
##     Balticinidae     Demospongiae      Rossellidae  Keratoisididae  
##             1150              943              837              806 
##   Schizopathidae   Euplectellidae         Porifera      Plexauridae 
##              732              648              391              348 
## Aphrocallistidae Acanthogorgiidae        Euretidae     Octocorallia 
##              310              206              206              196 
##   Anthoptilidae     Vulcanellidae   Hexactinellida  Chrysogorgiidae 
##              126              116               63               30 
##       Chalinidae        Mycalidae   Pheronematidae      Alcyoniidae 
##               28               26               24               20 
##    Cladopathidae     Umbellulidae     Pennatulidae     Hexacorallia 
##               10                6                4                2 
##  Kophobelemnidae 
##                2
# Which taxa/families are depth generalists? Which are more specialised? (Occuring in only one depth e.g.)

# Combine species data with depth information
species_depth <- cbind(SPE.fam, Depth = ENV$depth)

# Count how many unique depths each morphotype appears in
# (per family column: number of distinct Depth values where count > 0)
species_depth_counts <- species_depth %>%
  summarise(across(-Depth, ~length(unique(Depth[. > 0])))) %>%
  t() %>%
  as.data.frame()

colnames(species_depth_counts) <- "Unique_Depths"
species_depth_counts$Species <- rownames(species_depth_counts)

# View family with the most and least depth occurrences
species_depth_counts <- species_depth_counts[order(species_depth_counts$Unique_Depths), ]

# Define threshold (e.g., species in < 10 depths = specialist, otherwise generalist)
# NOTE(review): this Category (based on number of unique depths) is never
# used in the plot below — the plot uses the depth-range-based Category
# from species_ranges. The two definitions can disagree; confirm which is
# the intended classification.
species_depth_counts$Category <- ifelse(species_depth_counts$Unique_Depths < 10, "Specialist", "Generalist")

# Convert SPE to long format for ggplot: one row per (Depth, Species)
# observation with a positive count
species_long <- SPE.fam %>%
  mutate(Depth = ENV$depth) %>%
  pivot_longer(cols = -Depth, names_to = "Species", values_to = "Abundance") %>%
  filter(Abundance > 0)  # Remove absences

# Calculate depth range for each species; a range wider than 1000 m is
# classified as a generalist (second, independent Category definition)
species_ranges <- species_long %>%
  group_by(Species) %>%
  summarise(MinDepth = min(Depth), MaxDepth = max(Depth)) %>%
  mutate(DepthRange = MaxDepth - MinDepth,
         Category = ifelse(DepthRange > 1000, "Generalist", "Specialist"))

# Merge category information with long species data
species_long <- species_long %>%
  left_join(species_ranges, by = "Species")

# Plot with generalists highlighted (boxplot of occupied depths per
# species, ordered by depth, flipped for readable labels)
ggplot(species_long, aes(x = reorder(Species, Depth), y = Depth, color = Category)) +
  geom_boxplot() +
  coord_flip() +
  scale_color_manual(values = c("Generalist" = "red", "Specialist" = "black")) + 
  labs(y = "Depth (m)", x = "Species",
       title = "Species Depth Ranges",
       color = "Category") +
  theme_minimal()

5 GARDEN ANALYSIS

First step is to go through each dive transect, and see where the individual counts are the highest. For this a simple histogram by depth is needed.

5.1 Selection of high densities

#the time is calculated in seconds
min <- 10            # histogram bin width in minutes (NOTE: shadows base::min)
seconds <- min*60    # bin width in seconds
divesites <- unique(data.a$Dive.Name)
# One plot per dive site; length taken from the data instead of a
# hard-coded 15 so new dives are picked up automatically.
plotList <- vector("list", length = length(divesites))

for (i in seq_along(divesites)) {
  # Subset the annotations of this dive and plot the density histogram
  # of annotation start times (`..density..` and `size` were deprecated
  # in ggplot2 3.4.0 — see the warnings in the rendered output).
  dive_data <- subset(data.a, Dive.Name == divesites[i])
  plotList[[i]] <- ggplot(dive_data, aes(x = Start.Time, y = after_stat(density))) +
    geom_histogram(binwidth = seconds, alpha = 1) +  # alpha > 1 is clamped to 1, so 1 is equivalent
    geom_density(linewidth = 1, color = "blue")
}
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
print(plotList)
## [[1]]
## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

## 
## [[5]]

## 
## [[6]]

## 
## [[7]]

## 
## [[8]]

## 
## [[9]]

## 
## [[10]]

## 
## [[11]]

## 
## [[12]]

## 
## [[13]]

## 
## [[14]]

## 
## [[15]]

This was analysed manually, inspecting the plots for high density peaks. The following decisions were taken: Dives with high densities: EX2304: 02 + 07. EX2306: 03 + 05 + 14 + 15 + 18 + Maybe 07? + Maybe 12? Those dives were inspected again, and the timing of the high densities was specified. More details can be found in the next table:

5.2 Data completion

The following is a script to add data to the garden_density_df - as given in the supplementary file. Just to explain what has been done.

# Render the garden/forest metadata table
garden_length_df %>%
  kable() %>%
  kable_styling(font_size = 7, latex_options = "striped")
expedition dive_number dive_name potential_garden time_start time_end total.time frame.grab.every.min frame.grab.1 frame.ID.1 frame.grab.2 frame.ID.2 frame.grab.3 frame.ID.3 frame.grab.4 frame.ID.4 frame.grab.5 frame.ID.5 frame.grab.6 frame.ID.6 frame.grab.7 frame.ID.7 frame.grab.8 frame.ID.8 frame.grab.9 frame.ID.9 frame.grab.10 frame.ID.10 time_loc_1 lon_loc_1 lat_loc_1 depth_loc_1 time_loc_2 lon_loc_2 lat_loc_2 depth_loc_2 lasers_on dominating_fauna elevation comment
EX2304 2 230402 2304-2A 7/16/23 20:36 7/16/23 22:05 89.16667 8.92 20:36 1_203909 20:45 2_205111 20:54 3_205459 21:03 4_205735 21:12 5_211847 21:21 6_212126 21:30 7_213129 21:39 8_215013 9:48:00 PM (no good frame) 9_213521 9:57:00 PM (no good frame) 10_210645 7/16/23 20:36 -156.5278 54.75417 2269.5956 7/16/23 22:05 -156.5288 54.75486 2202.2417 yes sponges (PD03, PD02, PD05) flat sticks
EX2304 7 230407 2304-7A 7/23/23 17:39 7/24/23 0:06 386.73333 38.67 17:39 1_174453 18:20 2_181239 19:00 3_185428 19:40 4_193557 20:20 5_201919 21:00 6_212438 21:40 7_215329 22:20 8_222640 23:00 9_230000 23:40 10_233941 7/23/23 17:39 -169.7760 53.29347 777.2718 7/24/23 0:06 -169.7804 53.29140 532.4322 yes sponge and coral slight incline diverse
EX2306 3 230603 2306-3A 8/26/23 19:25 8/26/23 20:50 84.88333 8.49 19:30 1_193235 19:40 2_194145 19:50 3_nolasers_201343 20:00 4_200713 8:10:00 PM (no lasers) 5_202350 20:20 6_203608 20:30 7_205053 20:40 8_193355 20:50 9_nolasers_201453 21:00 10_202527 8/26/23 19:25 -146.3133 56.49172 838.7715 8/26/23 20:50 -146.3151 56.49096 787.8098 check check video
EX2306 5 230605 2306-5A 8/28/23 20:10 8/28/23 22:47 157.76667 15.78 20:10 1_201328 20:25 2_nolasers_202631 20:40 3_nolasers204228 8:55:00 PM (no lasers) 4_213425 9:10:00 PM (no lasers) 5_214745 9:25:00 PM (no lasers) 6_225655 21:40 7_230032 21:55 8_201508 10:10:00 PM (changed to earlier as no lasers) 9_232900 10:25:00 PM (changed to earlier as no frames) 10_202353 8/28/23 20:10 -144.3007 56.08964 579.7123 8/28/23 22:47 -144.3043 56.08717 433.9817 no lasers until 21:34
EX2306 7 230607 2306-7A 8/30/23 19:06 8/30/23 21:35 148.88333 14.89 19:06 1_190612 19:20 2_192047 7:45:00 PM (no lasers) 3_211342 8:00:00 PM (no lasers) 4_211700 8:15:00 PM (no lasers) 5_213502 8:30:00 PM (no lasers) 6_nolasers_200347 8:45:00 PM (no lasers) 7_nolasers_201827 9:00:00 PM (no lasers) 8_nolasers_203000 21:15 9_nolasers_205715 21:30 10_193037 8/30/23 19:06 -140.8327 55.01326 3276.3970 8/30/23 21:35 -140.8339 55.01235 3193.2476 flat
EX2306 12 230612 2306-12A 9/5/23 19:30 9/5/23 21:29 119.95000 12.00 19:30 1_193344 19:42 2_202339 19:54 3_203032 20:06 4_210207 20:12 5_210651 20:24 6_211730 20:36 7_213513 20:48 8_214117 21:00 9_nolasers_20000 21:12 10_223405 9/5/23 19:30 -134.5189 55.03189 1585.8863 9/5/23 21:29 -134.5176 55.03155 1532.3117 flat sticks
EX2306 15 230615 2306-15A 9/10/23 20:00 9/10/23 23:15 195.00000 19.50 20:00 EX2306_Dive15 Middleton Canyon_20230910T200340.000Z-001 20:20 EX2306_Dive15 Middleton Canyon_20230910T202410.000Z-001 20:40 EX2306_Dive15 Middleton Canyon_20230910T203735.000Z-001 21:00 EX2306_Dive15 Middleton Canyon_20230910T205500.000Z-001 21:20 EX2306_Dive15 Middleton Canyon_20230910T212710.000Z-001 21:40 EX2306_Dive15 Middleton Canyon_20230910T213925.000Z-001 22:00 EX2306_Dive15 Middleton Canyon_20230910T215955.000Z-001 22:20 8_EX2306_Dive15 Middleton Canyon_20230910T221630.000Z-001 22:40 9_EX2306_Dive15 Middleton Canyon_20230910T223920.000Z-001 23:00 10_EX2306_Dive15 Middleton Canyon_20230910T230005.000Z-001 9/10/23 20:03 -145.7151 59.24847 2173.3429 9/10/23 23:14 -145.7120 59.24975 2009.0146
EX2306 18 230618 2306-18A 9/12/23 20:16 9/12/23 20:35 19.11667 1.91 20:16 20:18 20:20 20:22 20:24 20:26 20:28 20:30 20:32 20:34 9/12/23 20:16 -147.0123 59.07392 1823.8740 9/12/23 20:35 -147.0119 59.07398 1818.4246 no wall wall 1. no lasers
EX2306 18 230618 2306-18B 9/12/23 22:00 9/12/23 22:50 50.13333 5.01 22:00 1_220228 22:05 2_220940 22:10 3_221347 22:15 4_221905 22:20 5_222621 22:25 6_223233 22:30 7_224327 22:35 8_224429 22:40 9_224628 22:45 10_223200 9/12/23 22:00 -147.0116 59.07334 1763.9329 9/12/23 22:50 -147.0113 59.07323 1746.3971 wall wall 2
# Function to calculate the great-circle (horizontal) distance in metres
# between two positions, returning NA if any coordinate is NA.
# lat1/lon1, lat2/lon2: decimal degrees.
calculate_distance <- function(lat1, lon1, lat2, lon2) {
  # anyNA() gives the scalar condition an if() needs; the original used
  # element-wise `|`, which works for scalars but is not scalar logic.
  if (anyNA(c(lat1, lon1, lat2, lon2))) {
    return(NA)
  }
  # distHaversine() expects c(lon, lat) ordering
  distHaversine(c(lon1, lat1), c(lon2, lat2))
}

# Horizontal (great-circle) distance between the two reference positions
garden_length_df$horizontal_distance_m <- mapply(
  calculate_distance,
  garden_length_df$lat_loc_1, garden_length_df$lon_loc_1,
  garden_length_df$lat_loc_2, garden_length_df$lon_loc_2
)

# Calculate the vertical distance (absolute depth difference) in metres.
# depth1/depth2: depths in metres; returns NA if either value is missing.
calculate_vertical_distance <- function(depth1, depth2) {
  # `||` (scalar, short-circuiting) is correct inside if();
  # the original used the element-wise `|`.
  if (is.na(depth1) || is.na(depth2)) {
    return(NA)
  }
  abs(depth2 - depth1)
}

# Vertical (depth) distance between the two reference positions
garden_length_df$vertical_distance_m <- mapply(
  calculate_vertical_distance,
  garden_length_df$depth_loc_1, garden_length_df$depth_loc_2
)

# Function to calculate the 3D (slope) distance between two positions:
# horizontal great-circle distance and depth difference combined with
# the Pythagorean theorem. Returns NA if any input is missing.
calculate_3D_distance <- function(lat1, lon1, depth1, lat2, lon2, depth2) {
  # anyNA() gives the scalar condition if() requires (original used `|`)
  if (anyNA(c(lat1, lon1, depth1, lat2, lon2, depth2))) {
    return(NA)
  }
  # Horizontal distance in meters using Haversine formula (lon/lat order)
  horizontal_distance <- distHaversine(c(lon1, lat1), c(lon2, lat2))
  # Vertical distance in meters
  vertical_distance <- abs(depth2 - depth1)
  # 3D distance using Pythagorean theorem
  sqrt(horizontal_distance^2 + vertical_distance^2)
}

# Calculate the 3D distance between the coordinates - total distance
garden_length_df$distance_3D_m <- mapply(
  calculate_3D_distance,
  garden_length_df$lat_loc_1, garden_length_df$lon_loc_1, garden_length_df$depth_loc_1,
  garden_length_df$lat_loc_2, garden_length_df$lon_loc_2, garden_length_df$depth_loc_2
)

# Convert Start.Date to POSIXct (UTC)
data.a$Start.Date <- ymd_hms(data.a$Start.Date, tz = "UTC")
# Derive the `time` column: the format()/as.POSIXct round-trip truncates
# any fractional seconds. The original also assigned an intermediate
# character version of `time` that was immediately overwritten by this
# line — that dead assignment has been removed.
data.a$time <- as.POSIXct(format(data.a$Start.Date, "%Y-%m-%d %H:%M:%S UTC"), tz = "UTC")

# Parse the garden start/end times (month/day/year hour:minute), convert
# to UTC, and store them in the same character format used for data.a
garden_length_df <- garden_length_df %>%
  mutate(
    time_start = format(with_tz(mdy_hm(time_start), "UTC"), "%Y-%m-%d %H:%M:%S UTC"),
    time_end   = format(with_tz(mdy_hm(time_end), "UTC"), "%Y-%m-%d %H:%M:%S UTC")
  )

# Order the annotation rows by time (the count expansion had shuffled them)
data.a <- data.a %>% arrange(time)

# number of observations in each garden: count the annotation timestamps
# in data.a that fall inside each garden's [time_start, time_end] window.
# mapply() over the two time columns replaces the indexed
# sapply(1:nrow(...)) loop of the original.
garden_length_df <- garden_length_df %>%
  mutate(n = mapply(function(start_chr, end_chr) {
    # Both columns were formatted as "%Y-%m-%d %H:%M:%S UTC" above
    start_time <- ymd_hms(start_chr, tz = "UTC")
    end_time <- ymd_hms(end_chr, tz = "UTC")
    # na.rm = TRUE mirrors filter()'s behaviour of dropping NA comparisons
    sum(data.a$time >= start_time & data.a$time <= end_time, na.rm = TRUE)
  }, time_start, time_end))

# Density metrics per garden
garden_length_df <- garden_length_df %>%
  mutate(
    ind_min = n / as.numeric(total.time),   # individuals per minute of transect time
    ind_m_traveled = n / distance_3D_m      # individuals per metre traveled (3D distance)
  )

# Show the completed table
garden_length_df %>%
  kable() %>%
  kable_styling(font_size = 5, latex_options = "striped")
expedition dive_number dive_name potential_garden time_start time_end total.time frame.grab.every.min frame.grab.1 frame.ID.1 frame.grab.2 frame.ID.2 frame.grab.3 frame.ID.3 frame.grab.4 frame.ID.4 frame.grab.5 frame.ID.5 frame.grab.6 frame.ID.6 frame.grab.7 frame.ID.7 frame.grab.8 frame.ID.8 frame.grab.9 frame.ID.9 frame.grab.10 frame.ID.10 time_loc_1 lon_loc_1 lat_loc_1 depth_loc_1 time_loc_2 lon_loc_2 lat_loc_2 depth_loc_2 lasers_on dominating_fauna elevation comment horizontal_distance_m vertical_distance_m distance_3D_m n ind_min ind_m_traveled
EX2304 2 230402 2304-2A 2023-07-16 20:36:00 UTC 2023-07-16 22:05:00 UTC 89.16667 8.92 20:36 1_203909 20:45 2_205111 20:54 3_205459 21:03 4_205735 21:12 5_211847 21:21 6_212126 21:30 7_213129 21:39 8_215013 9:48:00 PM (no good frame) 9_213521 9:57:00 PM (no good frame) 10_210645 7/16/23 20:36 -156.5278 54.75417 2269.5956 7/16/23 22:05 -156.5288 54.75486 2202.2417 yes sponges (PD03, PD02, PD05) flat sticks 98.27062 67.3539 119.13716 954 10.699065 8.007578
EX2304 7 230407 2304-7A 2023-07-23 17:39:00 UTC 2023-07-24 00:06:00 UTC 386.73333 38.67 17:39 1_174453 18:20 2_181239 19:00 3_185428 19:40 4_193557 20:20 5_201919 21:00 6_212438 21:40 7_215329 22:20 8_222640 23:00 9_230000 23:40 10_233941 7/23/23 17:39 -169.7760 53.29347 777.2718 7/24/23 0:06 -169.7804 53.29140 532.4322 yes sponge and coral slight incline diverse 371.90054 244.8396 445.25997 4619 11.943630 10.373715
EX2306 3 230603 2306-3A 2023-08-26 19:25:00 UTC 2023-08-26 20:50:00 UTC 84.88333 8.49 19:30 1_193235 19:40 2_194145 19:50 3_nolasers_201343 20:00 4_200713 8:10:00 PM (no lasers) 5_202350 20:20 6_203608 20:30 7_205053 20:40 8_193355 20:50 9_nolasers_201453 21:00 10_202527 8/26/23 19:25 -146.3133 56.49172 838.7715 8/26/23 20:50 -146.3151 56.49096 787.8098 check check video 139.56314 50.9617 148.57646 565 6.656195 3.802756
EX2306 5 230605 2306-5A 2023-08-28 20:10:00 UTC 2023-08-28 22:47:00 UTC 157.76667 15.78 20:10 1_201328 20:25 2_nolasers_202631 20:40 3_nolasers204228 8:55:00 PM (no lasers) 4_213425 9:10:00 PM (no lasers) 5_214745 9:25:00 PM (no lasers) 6_225655 21:40 7_230032 21:55 8_201508 10:10:00 PM (changed to earlier as no lasers) 9_232900 10:25:00 PM (changed to earlier as no frames) 10_202353 8/28/23 20:10 -144.3007 56.08964 579.7123 8/28/23 22:47 -144.3043 56.08717 433.9817 no lasers until 21:34 351.88581 145.7306 380.86878 808 5.121487 2.121465
EX2306 7 230607 2306-7A 2023-08-30 19:06:00 UTC 2023-08-30 21:35:00 UTC 148.88333 14.89 19:06 1_190612 19:20 2_192047 7:45:00 PM (no lasers) 3_211342 8:00:00 PM (no lasers) 4_211700 8:15:00 PM (no lasers) 5_213502 8:30:00 PM (no lasers) 6_nolasers_200347 8:45:00 PM (no lasers) 7_nolasers_201827 9:00:00 PM (no lasers) 8_nolasers_203000 21:15 9_nolasers_205715 21:30 10_193037 8/30/23 19:06 -140.8327 55.01326 3276.3970 8/30/23 21:35 -140.8339 55.01235 3193.2476 flat 127.90890 83.1494 152.55985 557 3.741184 3.651026
EX2306 12 230612 2306-12A 2023-09-05 19:30:00 UTC 2023-09-05 21:29:00 UTC 119.95000 12.00 19:30 1_193344 19:42 2_202339 19:54 3_203032 20:06 4_210207 20:12 5_210651 20:24 6_211730 20:36 7_213513 20:48 8_214117 21:00 9_nolasers_20000 21:12 10_223405 9/5/23 19:30 -134.5189 55.03189 1585.8863 9/5/23 21:29 -134.5176 55.03155 1532.3117 flat sticks 88.68787 53.5746 103.61359 486 4.051688 4.690505
EX2306 15 230615 2306-15A 2023-09-10 20:00:00 UTC 2023-09-10 23:15:00 UTC 195.00000 19.50 20:00 EX2306_Dive15 Middleton Canyon_20230910T200340.000Z-001 20:20 EX2306_Dive15 Middleton Canyon_20230910T202410.000Z-001 20:40 EX2306_Dive15 Middleton Canyon_20230910T203735.000Z-001 21:00 EX2306_Dive15 Middleton Canyon_20230910T205500.000Z-001 21:20 EX2306_Dive15 Middleton Canyon_20230910T212710.000Z-001 21:40 EX2306_Dive15 Middleton Canyon_20230910T213925.000Z-001 22:00 EX2306_Dive15 Middleton Canyon_20230910T215955.000Z-001 22:20 8_EX2306_Dive15 Middleton Canyon_20230910T221630.000Z-001 22:40 9_EX2306_Dive15 Middleton Canyon_20230910T223920.000Z-001 23:00 10_EX2306_Dive15 Middleton Canyon_20230910T230005.000Z-001 9/10/23 20:03 -145.7151 59.24847 2173.3429 9/10/23 23:14 -145.7120 59.24975 2009.0146 228.13187 164.3283 281.15465 728 3.733333 2.589322
EX2306 18 230618 2306-18A 2023-09-12 20:16:00 UTC 2023-09-12 20:35:00 UTC 19.11667 1.91 20:16 20:18 20:20 20:22 20:24 20:26 20:28 20:30 20:32 20:34 9/12/23 20:16 -147.0123 59.07392 1823.8740 9/12/23 20:35 -147.0119 59.07398 1818.4246 no wall wall 1. no lasers 24.32872 5.4494 24.93156 222 11.612903 8.904378
EX2306 18 230618 2306-18B 2023-09-12 22:00:00 UTC 2023-09-12 22:50:00 UTC 50.13333 5.01 22:00 1_220228 22:05 2_220940 22:10 3_221347 22:15 4_221905 22:20 5_222621 22:25 6_223233 22:30 7_224327 22:35 8_224429 22:40 9_224628 22:45 10_223200 9/12/23 22:00 -147.0116 59.07334 1763.9329 9/12/23 22:50 -147.0113 59.07323 1746.3971 wall wall 2 17.67383 17.5358 24.89716 370 7.380319 14.861131
#To save this file use:
#library(openxlsx)
#write.xlsx(garden_length_df, "output.xlsx")

Now we have detailed information for each forest. Next, the density data will be analysed.

5.3 Density data for forest analysis

I put this in the same df as the length data, but as sheets. Now I saved each individual sheet as a file, to be able to work with it. All files are available from the Supplementary File (just save each sheet separately).

# Read all garden files
# You may need to adapt the path

garden_dir <- "./garden densities"
garden_files <- c(
  "D02_2304_G02A.xlsx", "D03_2306_G03A.xlsx",
  "D05_2306_G05A.xlsx", "D07_2304_G07A.xlsx",
  "D07_2306_G07A.xlsx", "D12_2306_G12A.xlsx",
  "D15_2306_G15A.xlsx", "D18_2306_G18B.xlsx"
)
file_paths <- file.path(garden_dir, garden_files)

# NOTE(review): the fifth id ("2306-G07A") does not follow the
# D.._...._G.. pattern of the others and propagates into all result
# tables — confirm whether this naming is intentional.
garden_ids <- c("D02_2304_G02A", "D03_2306_G03A", "D05_2306_G05A", "D07_2304_G07A",
                "2306-G07A", "D12_2306_G12A", "D15_2306_G15A", "D18_2306_G18B")

# Read each file and tag its rows with the matching garden id
# (unnamed lapply keeps bind_rows' .id as positional "1", "2", ...)
list_of_dfs <- lapply(seq_along(file_paths), function(k) {
  garden <- read_excel(file_paths[k])
  garden$garden_id <- garden_ids[k]
  garden
})

# Average density per morphotype within each garden: every count column
# (names starting with "P" or "C") is divided by area_squared
# (presumably the surveyed area per frame — confirm) before averaging,
# then reshaped to long format (one row per garden x morphotype).
summarize_density_df <- function(df) {
  densities <- df %>%
    group_by(garden_id) %>%
    summarise(across(starts_with("P") | starts_with("C"),
                     ~ mean(.x / area_squared, na.rm = TRUE)))
  pivot_longer(densities, cols = -garden_id,
               names_to = "Morphotype", values_to = "density")
}

# Apply the density summary to every garden and stack into one data frame
gardens_df <- list_of_dfs %>%
  lapply(summarize_density_df) %>%
  bind_rows(.id = "df_id")

# Attach the taxonomic metadata for each morphotype
gardens_df <- left_join(gardens_df, taxa, by = "Morphotype")

# Plot function to simplify repeated plot code: stacked bar of total
# density per garden, coloured by the grouping column.
# data: long-format density data; group_col: column name (string) to
# group/fill by; fill_col: kept for call-site compatibility (same value
# as group_col in all current calls); title: plot title.
plot_density <- function(data, group_col, fill_col, title) {
  data %>%
    group_by(garden_id, !!sym(group_col)) %>%
    # "drop_last" is summarise()'s default (result stays grouped by
    # garden_id, so prop_density is per garden) but silences the message
    summarise(total_density = sum(density), .groups = "drop_last") %>%
    mutate(prop_density = total_density / sum(total_density)) %>%
    ggplot(aes(x = garden_id, y = total_density, fill = !!sym(group_col))) +
    # The original passed aes(weight = prop_density), which geom_bar with
    # stat = "identity" ignores (see the warning in the rendered output);
    # removing it leaves the plot unchanged and silences the warning.
    geom_bar(stat = "identity", position = "stack") +
    labs(x = "Garden ID", y = "Individuals per m^2", title = title) +
    theme_minimal() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))
}

# Plot densities by Phylum, Morphotype, and Family
# (stacked bars of total density per garden, coloured by phylum)
plot_density(gardens_df, "Phylum", "Phylum", "Densities by Phylum")
## `summarise()` has grouped output by 'garden_id'. You can override using the
## `.groups` argument.
## Warning in geom_bar(stat = "identity", aes(weight = prop_density), position =
## "stack"): Ignoring unknown aesthetics: weight

# Same stacked-bar plot, resolved to morphotype level
plot_density(gardens_df, "Morphotype", "Morphotype", "Densities by Morphotype")
## `summarise()` has grouped output by 'garden_id'. You can override using the
## `.groups` argument.
## Warning in geom_bar(stat = "identity", aes(weight = prop_density), position =
## "stack"): Ignoring unknown aesthetics: weight

# Same stacked-bar plot, resolved to family level
plot_density(gardens_df, "Family", "Family", "Densities by Family")
## `summarise()` has grouped output by 'garden_id'. You can override using the
## `.groups` argument.
## Warning in geom_bar(stat = "identity", aes(weight = prop_density), position =
## "stack"): Ignoring unknown aesthetics: weight

# Calculate sponge-dominated gardens: a garden counts as sponge-dominated
# when Porifera make up at least 70 % of its total density.
sponge_dominated <- gardens_df %>%
  group_by(garden_id, Phylum) %>%
  # "drop_last" matches summarise()'s default grouping (still grouped by
  # garden_id, so the proportion below is per garden) while silencing
  # the grouping message.
  summarise(total_density = sum(density, na.rm = TRUE), .groups = "drop_last") %>%
  group_by(garden_id) %>%
  mutate(proportion = total_density / sum(total_density, na.rm = TRUE)) %>%
  filter(Phylum == "Porifera") %>%
  mutate(is_sponge_dominated = proportion >= 0.7)
## `summarise()` has grouped output by 'garden_id'. You can override using the
## `.groups` argument.
# Output results as a styled table
sponge_dominated %>%
  kable() %>%
  kable_styling(font_size = 7, latex_options = "striped")
garden_id Phylum total_density proportion is_sponge_dominated
2306-G07A Porifera 2.5407633 0.8775114 TRUE
D02_2304_G02A Porifera 20.4033815 0.9867127 TRUE
D03_2306_G03A Porifera 0.6437813 0.2416753 FALSE
D05_2306_G05A Porifera 0.1644149 0.1264180 FALSE
D07_2304_G07A Porifera 1.5121467 0.1809220 FALSE
D12_2306_G12A Porifera 5.9436183 0.9657297 TRUE
D15_2306_G15A Porifera 0.1561155 0.0665035 FALSE
D18_2306_G18B Porifera 1.5929045 0.6335733 FALSE
# Calculate coral-dominated gardens: a garden counts as coral-dominated
# when Cnidaria make up at least 70 % of its total density.
coral_dominated <- gardens_df %>%
  group_by(garden_id, Phylum) %>%
  # "drop_last" matches summarise()'s default grouping (still grouped by
  # garden_id) while silencing the grouping message.
  summarise(total_density = sum(density, na.rm = TRUE), .groups = "drop_last") %>%
  group_by(garden_id) %>%
  mutate(proportion = total_density / sum(total_density, na.rm = TRUE)) %>%
  filter(Phylum == "Cnidaria") %>%
  mutate(is_coral_dominated = proportion >= 0.7)
## `summarise()` has grouped output by 'garden_id'. You can override using the
## `.groups` argument.
# Output results as a styled table
coral_dominated %>%
  kable() %>%
  kable_styling(font_size = 7, latex_options = "striped")
garden_id Phylum total_density proportion is_coral_dominated
2306-G07A Cnidaria 0.3546558 0.1224886 FALSE
D02_2304_G02A Cnidaria 0.2747567 0.0132873 FALSE
D03_2306_G03A Cnidaria 2.0200465 0.7583247 TRUE
D05_2306_G05A Cnidaria 1.1361501 0.8735820 TRUE
D07_2304_G07A Cnidaria 6.8458576 0.8190780 TRUE
D12_2306_G12A Cnidaria 0.2109180 0.0342703 FALSE
D15_2306_G15A Cnidaria 2.1913624 0.9334965 TRUE
D18_2306_G18B Cnidaria 0.9212551 0.3664267 FALSE

When comparing both df’s one can see that only Dive 18 Gumby Ridge has false for both, thus it is a true mixed forest.

6 Plot exact transect length

This code snippet is useful for looking at exact transect lengths - can be used to update the above values if needed. CHECK IF THIS CAN BE USED INSTEAD ABOVE??

###Finding transect length of each dive using a smooth line through time-sorted coordinates

# Function to calculate the length of a smooth curve using SPLINE:
# fit smoothing splines of latitude/longitude against time, sample 100
# points along the fit, and sum the Haversine distances between
# consecutive sampled points.
# df: one dive's rows from data.a (needs time, Latitude..deg., Longitude..deg.).
calculate_curve_length <- function(df) {
  df <- df %>% arrange(time)
  df$time_numeric <- as.numeric(df$time)
  smooth_lat <- smooth.spline(df$time_numeric, df$Latitude..deg., spar = 1)
  smooth_lon <- smooth.spline(df$time_numeric, df$Longitude..deg., spar = 1)

  time_seq <- seq(min(df$time_numeric), max(df$time_numeric), length.out = 100)
  smooth_lat_values <- predict(smooth_lat, time_seq)$y
  smooth_lon_values <- predict(smooth_lon, time_seq)$y

  # distHaversine() is vectorised over matrix rows, so the per-pair
  # mapply() of the original is unnecessary: pass consecutive points as
  # lon/lat matrices and sum the segment lengths (identical result).
  from <- cbind(head(smooth_lon_values, -1), head(smooth_lat_values, -1))
  to <- cbind(tail(smooth_lon_values, -1), tail(smooth_lat_values, -1))
  sum(distHaversine(from, to))
}

# Apply the function to each dive.
# pick(everything()) replaces cur_data(), which was deprecated in
# dplyr 1.1.0 (see the warning in the rendered output); both pass the
# group's non-grouping columns as a data frame.
lengths <- data.a %>%
  group_by(Dive.Name) %>%
  summarise(Length = calculate_curve_length(pick(everything())))
## Warning: There was 1 warning in `summarise()`.
## ℹ In argument: `Length = calculate_curve_length(cur_data())`.
## ℹ In group 1: `Dive.Name = "EX2304_DIVE02 Big Bend Canyon"`.
## Caused by warning:
## ! `cur_data()` was deprecated in dplyr 1.1.0.
## ℹ Please use `pick()` instead.
print(lengths) #looks good. I double-checked that no value looks out of range or fails to correspond to the transect in Seatube (comparing dives)
## # A tibble: 15 × 2
##    Dive.Name                         Length
##    <chr>                              <dbl>
##  1 EX2304_DIVE02 Big Bend Canyon       436.
##  2 EX2304_DIVE05 Lone Knoll Scarp      301.
##  3 EX2304_DIVE07 Uliaga Mound Take 2   381.
##  4 EX2304_DIVE08 Umnak Canyon          176.
##  5 EX2306_Dive01 Kodiak Shelf          328.
##  6 EX2306_Dive03 Giacomini Seamount    597.
##  7 EX2306_Dive04 Quinn Seamount        431.
##  8 EX2306_Dive05 Surveyor Seamount     728.
##  9 EX2306_Dive06 Durgin Guyot          403.
## 10 EX2306_Dive07 Deep Discover Dome    279.
## 11 EX2306_Dive08 Denson Seamount       531.
## 12 EX2306_Dive12 Noyes Canyon          305.
## 13 EX2306_Dive14 Chatham Seep          593.
## 14 EX2306_Dive15 Middleton Canyon      284.
## 15 EX2306_Dive18 Gumby Ridge           242.
#Calculate total distance of all dives (in metres)
sum(lengths$Length)
## [1] 6014.612
# Shorten the verbose dive names to the expedition_dive codes used in the
# plots (see the reference list at the top of the document). A named
# lookup vector replaces the long case_when(); coalesce() falls back to
# the original name for anything not in the table (same as TRUE ~ Dive.Name).
dive_code_lookup <- c(
  "EX2304_DIVE02 Big Bend Canyon"     = "2304_02",
  "EX2304_DIVE05 Lone Knoll Scarp"    = "2304_05",
  "EX2304_DIVE07 Uliaga Mound Take 2" = "2304_07",
  "EX2304_DIVE08 Umnak Canyon"        = "2304_08",
  "EX2306_Dive01 Kodiak Shelf"        = "2306_01",
  "EX2306_Dive03 Giacomini Seamount"  = "2306_03",
  "EX2306_Dive04 Quinn Seamount"      = "2306_04",
  "EX2306_Dive05 Surveyor Seamount"   = "2306_05",
  "EX2306_Dive06 Durgin Guyot"        = "2306_06",
  "EX2306_Dive07 Deep Discover Dome"  = "2306_07",
  "EX2306_Dive08 Denson Seamount"     = "2306_08",
  "EX2306_Dive12 Noyes Canyon"        = "2306_12",
  "EX2306_Dive14 Chatham Seep"        = "2306_14",
  "EX2306_Dive15 Middleton Canyon"    = "2306_15",
  "EX2306_Dive18 Gumby Ridge"         = "2306_18"
)

transect_lengths <- lengths %>%
  mutate(Dive.Name = coalesce(unname(dive_code_lookup[Dive.Name]), Dive.Name))

# To verify this function, I wanted to also plot the transects with coordinates and smooth-fitted line
# Function to create a plot for a specific dive - mostly to check if the transect data is ok.
# dive_id: full Dive.Name string as stored in data.a.
# Uses the same spline-smoothing approach as calculate_curve_length().
plot_transect_for_dive <- function(dive_id) {
  df <- data.a %>% filter(Dive.Name == dive_id) %>% arrange(time)
  df$time_numeric <- as.numeric(df$time)
  
  smooth_lat <- smooth.spline(df$time_numeric, df$Latitude..deg., spar = 1)
  smooth_lon <- smooth.spline(df$time_numeric, df$Longitude..deg., spar = 1)
  
  time_seq <- seq(min(df$time_numeric), max(df$time_numeric), length.out = 100)
  smooth_lat_values <- predict(smooth_lat, time_seq)$y
  smooth_lon_values <- predict(smooth_lon, time_seq)$y
  
  smooth_coords <- data.frame(lat = smooth_lat_values, lon = smooth_lon_values)
  
  # For line geoms, `linewidth` replaces `size` (deprecated in ggplot2
  # 3.4.0 — see the warning in the rendered output); `size` is still
  # correct for geom_point.
  ggplot() +
    geom_point(data = df, aes(x = Longitude..deg., y = Latitude..deg.), color = "blue", size = 2) +
    geom_line(data = smooth_coords, aes(x = lon, y = lat), color = "red", linewidth = 1) +
    labs(title = paste("Transect Line for Dive:", dive_id), x = "Longitude", y = "Latitude") +
    theme_minimal()
}

# Plots per dive
unique(data.a$Dive.Name) #15 dive sites (the original comment said 14, but the output below lists 15)
##  [1] "EX2304_DIVE02 Big Bend Canyon"     "EX2304_DIVE05 Lone Knoll Scarp"   
##  [3] "EX2304_DIVE07 Uliaga Mound Take 2" "EX2304_DIVE08 Umnak Canyon"       
##  [5] "EX2306_Dive01 Kodiak Shelf"        "EX2306_Dive03 Giacomini Seamount" 
##  [7] "EX2306_Dive04 Quinn Seamount"      "EX2306_Dive05 Surveyor Seamount"  
##  [9] "EX2306_Dive06 Durgin Guyot"        "EX2306_Dive07 Deep Discover Dome" 
## [11] "EX2306_Dive08 Denson Seamount"     "EX2306_Dive12 Noyes Canyon"       
## [13] "EX2306_Dive14 Chatham Seep"        "EX2306_Dive15 Middleton Canyon"   
## [15] "EX2306_Dive18 Gumby Ridge"
plot_transect_for_dive("EX2306_Dive01 Kodiak Shelf") #needs range of 1 

plot_transect_for_dive("EX2306_Dive04 Quinn Seamount")

plot_transect_for_dive("EX2306_Dive06 Durgin Guyot")

plot_transect_for_dive("EX2306_Dive08 Denson Seamount")

plot_transect_for_dive("EX2304_DIVE08 Umnak Canyon")

plot_transect_for_dive("EX2306_Dive14 Chatham Seep") #Nope

plot_transect_for_dive("EX2306_Dive18 Gumby Ridge")

plot_transect_for_dive("EX2304_DIVE05 Lone Knoll Scarp") #needs range 1. There is one data point very far away, ignored in the length!

plot_transect_for_dive("EX2306_Dive03 Giacomini Seamount")

plot_transect_for_dive("EX2306_Dive05 Surveyor Seamount")

plot_transect_for_dive("EX2306_Dive07 Deep Discover Dome")

plot_transect_for_dive("EX2306_Dive15 Middleton Canyon")

plot_transect_for_dive("EX2304_DIVE02 Big Bend Canyon")

plot_transect_for_dive("EX2304_DIVE07 Uliaga Mound Take 2")

# Works well except for Dive 14 (EX2306): the smoothed track jumps, and the
# reason is not clear. Below, the dive is split into two time fragments and
# each is smoothed separately; the summed fragment lengths are close to the
# value in the table above, so that estimate is taken as good.

# Dive 14: first isolate this dive from Dataset A
data_d14 <- data.a %>% filter(Dive.Name == "EX2306_Dive14 Chatham Seep")

# Raw navigation path, with points coloured by timestamp to show travel order
ggplot(data_d14, aes(x = Longitude..deg., y = Latitude..deg., color = time)) +
  geom_point(size = 2) +
  geom_line(aes(group = 1), color = "blue") +
  ggtitle("Transect Path for Dive 14") +
  xlab("Longitude") +
  ylab("Latitude") +
  theme_minimal() +
  scale_color_datetime()

# Time windows used to split Dive 14 into two contiguous fragments
start_time_1 <- ymd_hms("2023-09-07 21:00:00")
end_time_1 <- ymd_hms("2023-09-07 23:59:59")
start_time_2 <- ymd_hms("2023-09-07 18:00:00")
end_time_2 <- ymd_hms("2023-09-07 20:59:59")

# Split the dive records into the two fragments (both bounds inclusive)
fragment_1 <- data_d14 %>% filter(time >= start_time_1, time <= end_time_1)

fragment_2 <- data_d14 %>% filter(time >= start_time_2, time <= end_time_2)

# Overlay both fragments: fragment 1 in blue, fragment 2 in red
ggplot() +
  geom_point(data = fragment_1, aes(x = Longitude..deg., y = Latitude..deg.), color = "blue", size = 2) +
  geom_line(data = fragment_1, aes(x = Longitude..deg., y = Latitude..deg.), color = "blue") +
  geom_point(data = fragment_2, aes(x = Longitude..deg., y = Latitude..deg.), color = "red", size = 2) +
  geom_line(data = fragment_2, aes(x = Longitude..deg., y = Latitude..deg.), color = "red") +
  ggtitle("Transect Path for Dive 14 with Time Fragments") +
  xlab("Longitude") +
  ylab("Latitude") +
  theme_minimal()

#Ok looks better, now smooth the line through the two fragments and combine to one length

# Smooth a dive track by fitting independent cubic smoothing splines to
# latitude and longitude as functions of (numeric) time.
#
# df   : data frame with columns `time_numeric`, `Latitude..deg.` and
#        `Longitude..deg.` (decimal degrees).
# spar : smoothing parameter passed to stats::smooth.spline(); larger values
#        give a stiffer (smoother) track.
#
# Returns `df` with two added columns, `lat_smooth` and `lon_smooth`,
# holding the spline-predicted coordinates at the original time stamps.
apply_spline_smoothing <- function(df, spar = 0.5) {
  required <- c("time_numeric", "Latitude..deg.", "Longitude..deg.")
  missing_cols <- setdiff(required, names(df))
  if (length(missing_cols) > 0) {
    stop("apply_spline_smoothing: missing column(s): ",
         paste(missing_cols, collapse = ", "), call. = FALSE)
  }
  # smooth.spline() needs at least four distinct x values to fit a spline;
  # fail with a clear message instead of its cryptic internal error.
  if (length(unique(df$time_numeric)) < 4) {
    stop("apply_spline_smoothing: need at least 4 unique time values",
         call. = FALSE)
  }
  smooth_lat <- smooth.spline(df$time_numeric, df$Latitude..deg., spar = spar)
  smooth_lon <- smooth.spline(df$time_numeric, df$Longitude..deg., spar = spar)
  # Evaluate each fitted spline back at the observed time stamps
  df$lat_smooth <- predict(smooth_lat, df$time_numeric)$y
  df$lon_smooth <- predict(smooth_lon, df$time_numeric)$y
  df
}

# Fragment 1: add the numeric time axis needed by the spline fit
fragment_1$time_numeric <- as.numeric(fragment_1$time)
# Smooth fragment 1 with a very stiff fit (spar = 1.3)
fragment_1_smooth <- apply_spline_smoothing(fragment_1, spar = 1.3)

# Raw fragment-1 track (blue) with its smoothed version overlaid (red)
ggplot() +
  geom_point(data = fragment_1, aes(x = Longitude..deg., y = Latitude..deg.), color = "blue", size = 2) +
  geom_line(data = fragment_1, aes(x = Longitude..deg., y = Latitude..deg.), color = "blue") +
  geom_line(data = fragment_1_smooth, aes(x = lon_smooth, y = lat_smooth), color = "red") +
  ggtitle("Fragment 1: Transect Path with Spline Smoothing") +
  xlab("Longitude") +
  ylab("Latitude") +
  theme_minimal()

# Fragment 2: add the numeric time axis, then smooth with the default spar
fragment_2$time_numeric <- as.numeric(fragment_2$time)
fragment_2_smooth <- apply_spline_smoothing(fragment_2, spar = 0.5)

# Raw fragment-2 track (red) with its smoothed version overlaid (dark red)
ggplot() +
  geom_point(data = fragment_2, aes(x = Longitude..deg., y = Latitude..deg.), color = "red", size = 2) +
  geom_line(data = fragment_2, aes(x = Longitude..deg., y = Latitude..deg.), color = "red") +
  geom_line(data = fragment_2_smooth, aes(x = lon_smooth, y = lat_smooth), color = "darkred") +
  ggtitle("Fragment 2: Transect Path with Spline Smoothing") +
  xlab("Longitude") +
  ylab("Latitude") +
  theme_minimal()

# Total length (metres) of a smoothed track stored in the `lon_smooth` /
# `lat_smooth` columns of `df`. Rows are ordered chronologically by
# `time_numeric` before measuring.
calculate_path_length <- function(df) {
  # Sort fixes by time so segments follow the actual travel order
  df <- df[order(df$time_numeric), ]
  # distHaversine() on a single coordinate matrix gives the great-circle
  # distance between each pair of consecutive points (presumably in metres,
  # per geosphere's default Earth radius)
  segment_lengths <- distHaversine(as.matrix(df[, c("lon_smooth", "lat_smooth")]))
  # Total path length; NA segments (if any) are skipped
  sum(segment_lengths, na.rm = TRUE)
}

# Convert 'time' column to numeric for the function
# NOTE(review): these two assignments are redundant - time_numeric was already
# added to both fragments before smoothing, and the lengths below are computed
# on the *_smooth frames, which carry that column.
fragment_1$time_numeric <- as.numeric(fragment_1$time)
fragment_2$time_numeric <- as.numeric(fragment_2$time)

# Calculate path length for Fragment 1 (later time window, 21:00-24:00)
length_fragment_1 <- calculate_path_length(fragment_1_smooth)
print(paste("Length of Fragment 1:", length_fragment_1, "meters"))
## [1] "Length of Fragment 1: 172.394302710488 meters"
# Calculate path length for Fragment 2 (earlier time window, 18:00-21:00)
length_fragment_2 <- calculate_path_length(fragment_2_smooth)
print(paste("Length of Fragment 2:", length_fragment_2, "meters"))
## [1] "Length of Fragment 2: 427.701379863488 meters"
# Combined transect length across both fragments:
length_fragment_1+length_fragment_2 #600 m, so almost same as in the above table... dont know why the plot is so weird. But the value seems okay
## [1] 600.0957

7 DIVERSITY INDICES

7.1 Alpha Diversity

# Alpha diversity per dive, from the expanded records (Dataset A): each row of
# data.a is a single observed individual, so table() of the morphotypes within
# one dive yields that dive's abundance vector.

# Shannon's H' (for each dive)
H <- tapply(data.a$Morphotype, data.a$Dive.Name,
            function(morphs) diversity(table(morphs), index = "shannon"))

# Simpson's index (for each dive)
S <- tapply(data.a$Morphotype, data.a$Dive.Name,
            function(morphs) diversity(table(morphs), index = "simpson"))

# Observed richness: number of distinct morphotypes recorded on each dive
richness <- tapply(data.a$Morphotype, data.a$Dive.Name,
                   function(morphs) specnumber(table(morphs)))

# Pielou's evenness J' = H' / ln(richness).
# Evenness measures the relative abundance balance among morphotypes;
# low J' indicates that one or a few morphotypes dominate the community.
# J' is NaN when richness == 1 (log(1) == 0).
evenness <- H / log(richness)

# Create alpha diversity dataframe, include info about environment and locations.
# NOTE(review): this assumes the rows of ENV2 (and of data.c below) are in the
# same dive order as the tapply() outputs above - confirm upstream.
div.df <- cbind(shannon = H, richness = richness, pielou = evenness, simps = S, ENV2)
# Short site code (e.g. "2304_02"), presumably the first column of data.c
div.df$site <-  data.c[,1]

# Render the combined table (small font, striped rows for PDF output)
kable(div.df) %>% kable_styling(font_size = 8, latex_options = c("striped"))
shannon richness pielou simps sed_type sed_consistency oxygen depth temp eastness northness rugosity slope bs feature site
EX2304_DIVE02 Big Bend Canyon 0.9983277 14 0.3782895 0.4846525 3 2 2.1962855 2228.7927 1.827136 0.4689163 0.1141978 9.129195 37.35683 121.78761 1 2304_02
EX2304_DIVE05 Lone Knoll Scarp 1.5552652 9 0.7078317 0.7279378 3 2 3.1782767 2777.7042 1.613341 -0.1479803 0.5893032 30.086932 27.89061 18.81481 5 2304_05
EX2304_DIVE07 Uliaga Mound Take 2 2.5956375 33 0.7423514 0.9069230 5 3 0.9290499 657.7798 3.490384 0.1582887 0.0527446 14.017655 40.48497 75.80089 3 2304_07
EX2304_DIVE08 Umnak Canyon 0.0000000 1 NaN 0.0000000 1 1 1.0951864 1460.0114 2.271012 -0.4666053 -0.8508585 7.879064 15.01833 50.47222 1 2304_08
EX2306_Dive01 Kodiak Shelf 1.7023175 15 0.6286137 0.6916563 5 3 3.4607124 3033.9322 1.562592 0.3753842 0.9113762 82.076497 18.81724 82.02124 5 2306_01
EX2306_Dive03 Giacomini Seamount 2.1498557 23 0.6856513 0.8408186 5 3 0.5162314 814.8627 3.100999 0.6368190 0.7685244 53.575081 22.37613 151.49925 3 2306_03
EX2306_Dive04 Quinn Seamount 1.8524738 8 0.8908516 0.8270360 5 3 1.6349495 1929.7082 1.985380 0.8025602 -0.4711487 60.576255 25.13941 125.69265 3 2306_04
EX2306_Dive05 Surveyor Seamount 1.9745089 24 0.6212950 0.8090708 5 3 0.6544988 513.0706 3.779863 0.3423503 0.9244736 61.071748 28.17219 29.21577 3 2306_05
EX2306_Dive06 Durgin Guyot 2.4916258 25 0.7740671 0.8880535 5 3 0.5487520 1131.5768 2.805364 0.8483461 -0.4928408 77.441229 33.81878 159.94237 3 2306_06
EX2306_Dive07 Deep Discover Dome 0.8830443 23 0.2816284 0.3167201 5 3 3.5517937 3227.4160 1.547009 0.6927170 0.7062478 86.166682 18.31629 72.88986 3 2306_07
EX2306_Dive08 Denson Seamount 1.8164333 21 0.5966234 0.6812386 5 3 0.6390790 1362.0356 2.580212 0.4377730 0.8921247 89.790400 27.66106 NaN 3 2306_08
EX2306_Dive12 Noyes Canyon 0.5710963 15 0.2108884 0.2176131 2 1 1.0621382 1544.2669 2.330919 -0.3129675 0.9497064 149.337704 32.59931 79.33240 1 2306_12
EX2306_Dive14 Chatham Seep 1.6203378 19 0.5503044 0.7487424 7 3 0.6125767 719.7000 4.071347 0.0294458 -0.9181381 43.586525 10.58621 170.93689 4 2306_14
EX2306_Dive15 Middleton Canyon 2.2664271 37 0.6276595 0.8001073 5 3 2.1241257 2107.5228 1.820679 -0.5628847 -0.8262024 175.290324 33.84623 149.72432 1 2306_15
EX2306_Dive18 Gumby Ridge 1.8484203 30 0.5434616 0.7088397 3 2 1.3861362 1802.4660 2.089785 -0.2675006 0.9148033 104.594112 24.40463 147.32355 2 2306_18
# NOTE: Dive 8 2304 has only one morphotype, so its evenness is NA

# Optionally sort locations by the median of 'shannon':
# div.df$site <- with(div.df, reorder(site, shannon, median))

# Shannon's H' per site; colour encodes depth (reversed cividis so shallow is
# light), point shape encodes the sediment-consistency class.
plot.shan <- ggplot(div.df, aes(x = site, y = shannon, colour = depth, shape = as.factor(sed_consistency))) + 
  geom_point(size = 3) +
  scale_colour_viridis_c(option = "cividis", begin = 1, end = 0) +
  labs(x = "", y = "Shannon's H") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.4),
        plot.margin = unit(c(0, 0, 0, 0), "cm"))
#scale_shape_manual(values = c(15, 16, 17, 18, 19, 20, 21, 22, 23))  # Customize the shapes as needed
plot.shan

# Observed richness per site; colour encodes depth, point shape encodes the
# sediment-consistency class (same styling as plot.shan).
plot.rich <- ggplot(div.df, aes(x = site, y = richness, colour = depth, shape = as.factor(sed_consistency))) + 
  geom_point(size = 3) +
  scale_colour_viridis_c(option = "cividis", begin = 1, end = 0) +
  labs(x = "", y = "Richness") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.4),
        plot.margin = unit(c(0, 0, 0, 0), "cm"))
#scale_shape_manual(values = c(15, 16, 17, 18, 19, 20, 21, 22, 23))  # Customize the shapes as needed
plot.rich

# Pielou's evenness per site; colour encodes depth, point shape encodes the
# sediment-consistency class. Umnak Canyon (richness 1, evenness NaN) drops
# out, which triggers the "Removed 1 row" warning below.
plot.even <- ggplot(div.df, aes(x = site, y = pielou, colour = depth, shape = as.factor(sed_consistency))) + 
  geom_point(size = 3) +
  scale_colour_viridis_c(option = "cividis", begin = 1, end = 0) +
  # FIX: axis label was misspelled "Pielous eveness"
  ylab("Pielou's evenness") +
  xlab("") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.4),
        plot.margin = unit(c(0, 0, 0, 0), "cm"))  # Ensure margins are set
#scale_shape_manual(values = c(15, 16, 17, 18, 19, 20, 21, 22, 23))  # Customize the shapes as needed
plot.even
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).

> Now, we explore the alpha diversity further

# What impacts have the various environmental factors on the diversity? 

# Response variables: the four alpha-diversity indices computed above
responses <- c("shannon", "simps", "pielou", "richness")
# Predictor variables: environmental descriptors from div.df.
# NOTE(review): sed_type, sed_consistency and feature are numerically coded,
# so these models treat them as continuous, not as factors - confirm intended.
predictors <- c("depth", "sed_type", "sed_consistency", "temp", "slope", "feature")

# Fit one simple linear model per response/predictor pair (4 x 6 = 24 models)
# and print each summary; lm_results keeps the summaries as a nested list.
lm_results <- lapply(responses, function(resp) {
  cat("\n### Response:", resp, "###\n")
  lapply(predictors, function(pred) {
    cat("\nModel:", resp, "~", pred, "\n")
    print(summary(lm(as.formula(paste(resp, "~", pred)), data = div.df)))
  })})
## 
## ### Response: shannon ###
## 
## Model: shannon ~ depth 
## 
## Call:
## lm(formula = as.formula(paste(resp, "~", pred)), data = div.df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.6862 -0.2888  0.2429  0.3811  0.7639 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.1005074  0.4102993   5.119 0.000197 ***
## depth       -0.0002837  0.0002178  -1.303 0.215191    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.707 on 13 degrees of freedom
## Multiple R-squared:  0.1155, Adjusted R-squared:  0.04747 
## F-statistic: 1.698 on 1 and 13 DF,  p-value: 0.2152
## 
## 
## Model: shannon ~ sed_type 
## 
## Call:
## lm(formula = as.formula(paste(resp, "~", pred)), data = div.df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.97387 -0.27041 -0.00444  0.37465  0.73873 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept)  0.25333    0.43429   0.583  0.56966   
## sed_type     0.32072    0.09615   3.335  0.00537 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5518 on 13 degrees of freedom
## Multiple R-squared:  0.4611, Adjusted R-squared:  0.4197 
## F-statistic: 11.13 on 1 and 13 DF,  p-value: 0.005369
## 
## 
## Model: shannon ~ sed_consistency 
## 
## Call:
## lm(formula = as.formula(paste(resp, "~", pred)), data = div.df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.08915 -0.24636  0.00232  0.31416  0.62724 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      -0.2808     0.4536  -0.619 0.546561    
## sed_consistency   0.7510     0.1723   4.359 0.000774 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4791 on 13 degrees of freedom
## Multiple R-squared:  0.5938, Adjusted R-squared:  0.5625 
## F-statistic:    19 on 1 and 13 DF,  p-value: 0.0007739
## 
## 
## Model: shannon ~ temp 
## 
## Call:
## lm(formula = as.formula(paste(resp, "~", pred)), data = div.df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.5623 -0.4366  0.2014  0.3726  0.8468 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)   0.8426     0.5870   1.435    0.175
## temp          0.3169     0.2272   1.395    0.186
## 
## Residual standard error: 0.701 on 13 degrees of freedom
## Multiple R-squared:  0.1302, Adjusted R-squared:  0.06333 
## F-statistic: 1.947 on 1 and 13 DF,  p-value: 0.1863
## 
## 
## Model: shannon ~ slope 
## 
## Call:
## lm(formula = as.formula(paste(resp, "~", pred)), data = div.df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.2567 -0.2913  0.2945  0.4506  0.6637 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)  0.73831    0.60606   1.218    0.245
## slope        0.03342    0.02191   1.525    0.151
## 
## Residual standard error: 0.6923 on 13 degrees of freedom
## Multiple R-squared:  0.1518, Adjusted R-squared:  0.08659 
## F-statistic: 2.327 on 1 and 13 DF,  p-value: 0.1511
## 
## 
## Model: shannon ~ feature 
## 
## Call:
## lm(formula = as.formula(paste(resp, "~", pred)), data = div.df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.2670 -0.4568  0.1401  0.4252  0.9994 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept)   1.0624     0.4214   2.521   0.0255 *
## feature       0.2046     0.1394   1.468   0.1660  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6962 on 13 degrees of freedom
## Multiple R-squared:  0.1421, Adjusted R-squared:  0.07615 
## F-statistic: 2.154 on 1 and 13 DF,  p-value: 0.166
## 
## 
## ### Response: simps ###
## 
## Model: simps ~ depth 
## 
## Call:
## lm(formula = as.formula(paste(resp, "~", pred)), data = div.df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.66476 -0.05015  0.07641  0.18155  0.20662 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  8.026e-01  1.538e-01   5.217 0.000166 ***
## depth       -9.442e-05  8.165e-05  -1.156 0.268330    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2651 on 13 degrees of freedom
## Multiple R-squared:  0.09327,    Adjusted R-squared:  0.02352 
## F-statistic: 1.337 on 1 and 13 DF,  p-value: 0.2683
## 
## 
## Model: simps ~ sed_type 
## 
## Call:
## lm(formula = as.formula(paste(resp, "~", pred)), data = div.df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.41738 -0.09893  0.06601  0.13034  0.24149 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept)  0.11497    0.15474   0.743  0.47070   
## sed_type     0.12383    0.03426   3.614  0.00314 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1966 on 13 degrees of freedom
## Multiple R-squared:  0.5012, Adjusted R-squared:  0.4629 
## F-statistic: 13.06 on 1 and 13 DF,  p-value: 0.003144
## 
## 
## Model: simps ~ sed_consistency 
## 
## Call:
## lm(formula = as.formula(paste(resp, "~", pred)), data = div.df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.45610 -0.05262  0.02729  0.09162  0.23267 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     -0.05984    0.16851  -0.355 0.728208    
## sed_consistency  0.27755    0.06400   4.337 0.000806 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.178 on 13 degrees of freedom
## Multiple R-squared:  0.5913, Adjusted R-squared:  0.5599 
## F-statistic: 18.81 on 1 and 13 DF,  p-value: 0.0008061
## 
## 
## Model: simps ~ temp 
## 
## Call:
## lm(formula = as.formula(paste(resp, "~", pred)), data = div.df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.62117 -0.08454  0.10906  0.16926  0.23958 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)  0.35308    0.21722   1.625    0.128
## temp         0.11805    0.08405   1.404    0.184
## 
## Residual standard error: 0.2594 on 13 degrees of freedom
## Multiple R-squared:  0.1317, Adjusted R-squared:  0.06496 
## F-statistic: 1.973 on 1 and 13 DF,  p-value: 0.1836
## 
## 
## Model: simps ~ slope 
## 
## Call:
## lm(formula = as.formula(paste(resp, "~", pred)), data = div.df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.53404 -0.11135  0.08585  0.16159  0.25713 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.390288   0.232378   1.680    0.117
## slope       0.009572   0.008400   1.139    0.275
## 
## Residual standard error: 0.2654 on 13 degrees of freedom
## Multiple R-squared:  0.0908, Adjusted R-squared:  0.02087 
## F-statistic: 1.298 on 1 and 13 DF,  p-value: 0.2751
## 
## 
## Model: simps ~ feature 
## 
## Call:
## lm(formula = as.formula(paste(resp, "~", pred)), data = div.df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.47315 -0.15600  0.01177  0.16446  0.32696 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept)  0.37498    0.14703   2.550   0.0242 *
## feature      0.09816    0.04865   2.018   0.0648 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2429 on 13 degrees of freedom
## Multiple R-squared:  0.2385, Adjusted R-squared:  0.1799 
## F-statistic: 4.071 on 1 and 13 DF,  p-value: 0.06476
## 
## 
## ### Response: pielou ###
## 
## Model: pielou ~ depth 
## 
## Call:
## lm(formula = as.formula(paste(resp, "~", pred)), data = div.df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.38699 -0.08173  0.01656  0.11165  0.31556 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  6.883e-01  1.112e-01   6.191 4.65e-05 ***
## depth       -5.859e-05  5.819e-05  -1.007    0.334    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1884 on 12 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.0779, Adjusted R-squared:  0.001062 
## F-statistic: 1.014 on 1 and 12 DF,  p-value: 0.3339
## 
## 
## Model: pielou ~ sed_type 
## 
## Call:
## lm(formula = as.formula(paste(resp, "~", pred)), data = div.df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.33780 -0.09387  0.00870  0.10874  0.27142 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)  0.31047    0.17909   1.734    0.109
## sed_type     0.06179    0.03837   1.610    0.133
## 
## Residual standard error: 0.1779 on 12 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.1777, Adjusted R-squared:  0.1092 
## F-statistic: 2.594 on 1 and 12 DF,  p-value: 0.1333
## 
## 
## Model: pielou ~ sed_consistency 
## 
## Call:
## lm(formula = as.formula(paste(resp, "~", pred)), data = div.df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.36796 -0.08586 -0.02145  0.08578  0.24127 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)  
## (Intercept)      0.13677    0.19078   0.717   0.4872  
## sed_consistency  0.17094    0.07034   2.430   0.0317 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1606 on 12 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.3298, Adjusted R-squared:  0.274 
## F-statistic: 5.906 on 1 and 12 DF,  p-value: 0.03172
## 
## 
## Model: pielou ~ temp 
## 
## Call:
## lm(formula = as.formula(paste(resp, "~", pred)), data = div.df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.37111 -0.09140  0.03547  0.10045  0.32491 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept)  0.47373    0.16223   2.920   0.0128 *
## temp         0.04645    0.06227   0.746   0.4701  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1918 on 12 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.04431,    Adjusted R-squared:  -0.03533 
## F-statistic: 0.5564 on 1 and 12 DF,  p-value: 0.4701
## 
## 
## Model: pielou ~ slope 
## 
## Call:
## lm(formula = as.formula(paste(resp, "~", pred)), data = div.df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.38721 -0.03211  0.02922  0.11506  0.30608 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept) 0.539835   0.189232   2.853   0.0145 *
## slope       0.001787   0.006675   0.268   0.7934  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1956 on 12 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.00594,    Adjusted R-squared:  -0.0769 
## F-statistic: 0.07171 on 1 and 12 DF,  p-value: 0.7934
## 
## 
## Model: pielou ~ feature 
## 
## Call:
## lm(formula = as.formula(paste(resp, "~", pred)), data = div.df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.31627 -0.09741  0.00491  0.13028  0.29295 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept)  0.40128    0.11723   3.423  0.00505 **
## feature      0.06554    0.03761   1.742  0.10698   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1753 on 12 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.2019, Adjusted R-squared:  0.1354 
## F-statistic: 3.036 on 1 and 12 DF,  p-value: 0.107
## 
## 
## ### Response: richness ###
## 
## Model: richness ~ depth 
## 
## Call:
## lm(formula = as.formula(paste(resp, "~", pred)), data = div.df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -19.442  -4.737   0.281   5.590  18.387 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 24.566428   5.682605   4.323 0.000827 ***
## depth       -0.002825   0.003016  -0.937 0.366057    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.791 on 13 degrees of freedom
## Multiple R-squared:  0.06321,    Adjusted R-squared:  -0.008853 
## F-statistic: 0.8772 on 1 and 13 DF,  p-value: 0.3661
## 
## 
## Model: richness ~ sed_type 
## 
## Call:
## lm(formula = as.formula(paste(resp, "~", pred)), data = div.df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -14.0445  -6.9838   0.9555   2.5466  14.9555 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept)    6.741      6.978   0.966   0.3517  
## sed_type       3.061      1.545   1.981   0.0691 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.866 on 13 degrees of freedom
## Multiple R-squared:  0.2319, Adjusted R-squared:  0.1728 
## F-statistic: 3.925 on 1 and 13 DF,  p-value: 0.06913
## 
## 
## Model: richness ~ sed_consistency 
## 
## Call:
## lm(formula = as.formula(paste(resp, "~", pred)), data = div.df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -15.0345  -5.5690  -0.0345   3.8966  13.9655 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)  
## (Intercept)        2.241      8.132   0.276   0.7872  
## sed_consistency    6.931      3.088   2.244   0.0429 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.588 on 13 degrees of freedom
## Multiple R-squared:  0.2792, Adjusted R-squared:  0.2238 
## F-statistic: 5.037 on 1 and 13 DF,  p-value: 0.04286
## 
## 
## Model: richness ~ temp 
## 
## Call:
## lm(formula = as.formula(paste(resp, "~", pred)), data = div.df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -18.1891  -5.2212  -0.1077   5.1200  19.2789 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)   11.786      8.143   1.447    0.171
## temp           3.260      3.151   1.035    0.320
## 
## Residual standard error: 9.724 on 13 degrees of freedom
## Multiple R-squared:  0.07608,    Adjusted R-squared:  0.005004 
## F-statistic:  1.07 on 1 and 13 DF,  p-value: 0.3197
## 
## 
## Model: richness ~ slope 
## 
## Call:
## lm(formula = as.formula(paste(resp, "~", pred)), data = div.df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -13.417  -9.330   1.716   6.623  13.704 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)   7.3337     8.0831   0.907    0.381
## slope         0.4716     0.2922   1.614    0.131
## 
## Residual standard error: 9.233 on 13 degrees of freedom
## Multiple R-squared:  0.1669, Adjusted R-squared:  0.1029 
## F-statistic: 2.605 on 1 and 13 DF,  p-value: 0.1305
## 
## 
## Model: richness ~ feature 
## 
## Call:
## lm(formula = as.formula(paste(resp, "~", pred)), data = div.df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -20.037  -6.537   1.390   4.890  15.963 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept)  21.7513     6.0934   3.570  0.00343 **
## feature      -0.7139     2.0163  -0.354  0.72896   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.07 on 13 degrees of freedom
## Multiple R-squared:  0.009552,   Adjusted R-squared:  -0.06664 
## F-statistic: 0.1254 on 1 and 13 DF,  p-value: 0.729
# Only sediment consistency has significant effect on alpha diversity evenness!

# Shannon's H' against depth with a fitted linear trend
# (black line, grey 95% confidence ribbon) to show the overall tendency.
depth.reg <- ggplot(div.df, aes(x = depth, y = shannon, colour = site)) +
  geom_smooth(method = "lm", colour = "black", fill = "grey90") +
  geom_point(size = 3) +
  scale_colour_viridis_d(option = "magma", begin = 0.2, end = 0.8) +
  # FIX: bquote(Depth (m)) parsed as the call Depth(m) and rendered as
  # "Depth(m)"; a plain string gives the intended "Depth (m)" label
  xlab("Depth (m)") + 
  ylab("Shannon's H'") +
  ylim(0,3.5) +
  theme_bw()
depth.reg 
## `geom_smooth()` using formula = 'y ~ x'

# Shannon's H' against the (numerically coded) primary sediment
# consistency class per site.
sediments.reg <- ggplot(div.df, aes(x = sed_consistency, y = shannon, colour = site)) +
  geom_point(size = 3) +
  scale_colour_viridis_d(option = "magma", begin = 0.2, end = 0.8) +
  # FIX: "(m)" was copied from the depth plot; consistency class has no unit
  xlab("Primary sediment consistency") + 
  ylab("Shannon's H'") +
  ylim(0,3.5) +
  theme_bw()
sediments.reg 

# Same response ~ predictor grid as above, but fitted as one-way ANOVAs.
# With a single numeric predictor the aov F-test matches the lm F-test,
# as the printed tables below confirm.
anova_results <- lapply(responses, function(resp) {
  cat("\n### Response:", resp, "###\n")
  lapply(predictors, function(pred) {
    cat("\nModel:", resp, "~", pred, "\n")
    print(summary(aov(as.formula(paste(resp, "~", pred)), data = div.df)))
  })})
## 
## ### Response: shannon ###
## 
## Model: shannon ~ depth 
##             Df Sum Sq Mean Sq F value Pr(>F)
## depth        1  0.849  0.8485   1.698  0.215
## Residuals   13  6.497  0.4998               
## 
## Model: shannon ~ sed_type 
##             Df Sum Sq Mean Sq F value  Pr(>F)   
## sed_type     1  3.387   3.387   11.12 0.00537 **
## Residuals   13  3.958   0.304                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Model: shannon ~ sed_consistency 
##                 Df Sum Sq Mean Sq F value   Pr(>F)    
## sed_consistency  1  4.362   4.362      19 0.000774 ***
## Residuals       13  2.984   0.230                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Model: shannon ~ temp 
##             Df Sum Sq Mean Sq F value Pr(>F)
## temp         1  0.957  0.9567   1.947  0.186
## Residuals   13  6.389  0.4915               
## 
## Model: shannon ~ slope 
##             Df Sum Sq Mean Sq F value Pr(>F)
## slope        1  1.115  1.1153   2.327  0.151
## Residuals   13  6.230  0.4793               
## 
## Model: shannon ~ feature 
##             Df Sum Sq Mean Sq F value Pr(>F)
## feature      1  1.044  1.0441   2.154  0.166
## Residuals   13  6.302  0.4847               
## 
## ### Response: simps ###
## 
## Model: simps ~ depth 
##             Df Sum Sq Mean Sq F value Pr(>F)
## depth        1 0.0940 0.09397   1.337  0.268
## Residuals   13 0.9135 0.07027               
## 
## Model: simps ~ sed_type 
##             Df Sum Sq Mean Sq F value  Pr(>F)   
## sed_type     1 0.5050  0.5050   13.06 0.00314 **
## Residuals   13 0.5025  0.0387                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Model: simps ~ sed_consistency 
##                 Df Sum Sq Mean Sq F value   Pr(>F)    
## sed_consistency  1 0.5957  0.5957   18.81 0.000806 ***
## Residuals       13 0.4117  0.0317                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Model: simps ~ temp 
##             Df Sum Sq Mean Sq F value Pr(>F)
## temp         1 0.1327 0.13273   1.973  0.184
## Residuals   13 0.8747 0.06729               
## 
## Model: simps ~ slope 
##             Df Sum Sq Mean Sq F value Pr(>F)
## slope        1 0.0915 0.09148   1.298  0.275
## Residuals   13 0.9160 0.07046               
## 
## Model: simps ~ feature 
##             Df Sum Sq Mean Sq F value Pr(>F)  
## feature      1 0.2403 0.24026   4.071 0.0648 .
## Residuals   13 0.7672 0.05902                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## ### Response: pielou ###
## 
## Model: pielou ~ depth 
##             Df Sum Sq Mean Sq F value Pr(>F)
## depth        1 0.0360 0.03599   1.014  0.334
## Residuals   12 0.4259 0.03549               
## 1 observation deleted due to missingness
## 
## Model: pielou ~ sed_type 
##             Df Sum Sq Mean Sq F value Pr(>F)
## sed_type     1 0.0821 0.08209   2.594  0.133
## Residuals   12 0.3798 0.03165               
## 1 observation deleted due to missingness
## 
## Model: pielou ~ sed_consistency 
##                 Df Sum Sq Mean Sq F value Pr(>F)  
## sed_consistency  1 0.1524  0.1524   5.906 0.0317 *
## Residuals       12 0.3096  0.0258                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 1 observation deleted due to missingness
## 
## Model: pielou ~ temp 
##             Df Sum Sq Mean Sq F value Pr(>F)
## temp         1 0.0205 0.02047   0.556   0.47
## Residuals   12 0.4415 0.03679               
## 1 observation deleted due to missingness
## 
## Model: pielou ~ slope 
##             Df Sum Sq Mean Sq F value Pr(>F)
## slope        1 0.0027 0.00274   0.072  0.793
## Residuals   12 0.4592 0.03826               
## 1 observation deleted due to missingness
## 
## Model: pielou ~ feature 
##             Df Sum Sq Mean Sq F value Pr(>F)
## feature      1 0.0933 0.09327   3.036  0.107
## Residuals   12 0.3687 0.03072               
## 1 observation deleted due to missingness
## 
## ### Response: richness ###
## 
## Model: richness ~ depth 
##             Df Sum Sq Mean Sq F value Pr(>F)
## depth        1   84.1   84.09   0.877  0.366
## Residuals   13 1246.3   95.87               
## 
## Model: richness ~ sed_type 
##             Df Sum Sq Mean Sq F value Pr(>F)  
## sed_type     1  308.5  308.52   3.925 0.0691 .
## Residuals   13 1021.9   78.61                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Model: richness ~ sed_consistency 
##                 Df Sum Sq Mean Sq F value Pr(>F)  
## sed_consistency  1  371.5   371.5   5.037 0.0429 *
## Residuals       13  958.9    73.8                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Model: richness ~ temp 
##             Df Sum Sq Mean Sq F value Pr(>F)
## temp         1  101.2  101.21    1.07   0.32
## Residuals   13 1229.2   94.55               
## 
## Model: richness ~ slope 
##             Df Sum Sq Mean Sq F value Pr(>F)
## slope        1  222.1  222.10   2.605  0.131
## Residuals   13 1108.3   85.25               
## 
## Model: richness ~ feature 
##             Df Sum Sq Mean Sq F value Pr(>F)
## feature      1   12.7   12.71   0.125  0.729
## Residuals   13 1317.7  101.36
#Explore the values combined for hard, medium, soft...
#Explore the values combined for hard, medium, soft...
# Boxplot of Shannon diversity by sediment consistency, with per-site points overlaid
sed.plot <- ggplot(div.df, aes(x = sed_consistency, y = shannon, fill = as.factor(sed_consistency))) +
  geom_boxplot(aes(fill = as.factor(sed_consistency))) +
  geom_point(size = 3, aes(colour = site)) + 
  scale_colour_viridis_d(option = "magma", begin = 0.2, end = 0.8) +
  # `guide = FALSE` was deprecated in ggplot2 3.3.4; "none" is the supported value
  scale_fill_manual(values = c("grey60", "grey90", "black"), guide = "none") +
  ylab("Shannon's H'") + 
  xlab('')+
  theme_bw() 
sed.plot
## Warning: The `guide` argument in `scale_*()` cannot be `FALSE`. This was deprecated in
## ggplot2 3.3.4.
## ℹ Please use "none" instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# Same exploration as above, but grouped by sediment type instead of consistency
sed.plot2 <- ggplot(div.df,
                    aes(x = sed_type, y = shannon, fill = as.factor(sed_type))) +
  geom_boxplot(aes(fill = as.factor(sed_type))) +
  geom_point(aes(colour = site), size = 3) +
  scale_colour_viridis_d(option = "magma", begin = 0.2, end = 0.8) +
  #scale_fill_manual(guide = FALSE) +
  labs(x = '', y = "Shannon's H'") +
  theme_bw()
sed.plot2 

# NOTE: When adjusting the depth bins there was no obvious difference 

Summary of the alpha diversity: Only sediments had a significant effect on the within-site diversities. Hard substrates show the highest Shannon. Uliaga seamount is the most diverse site, and Umnak Canyon the lowest with an index of 0 and only 1 morphotype. There was no clear trend for feature, but a slight negative trend with depth.

7.2 Beta Diversity

# To calculate Beta diversity (across sites) pairwise dissimilarities/distances need to be calculated
# Prepare data: Convert morphotype data to presence-absence matrix
SPE.pa <- SPE2
SPE.pa[SPE.pa > 0] <- 1

# Calculate beta diversity using Bray-Curtis dissimilarity
# NOTE: on presence-absence data, Bray-Curtis is equivalent to the Sorensen index
bray.df <- vegdist(SPE.pa, method = "bray")
# Optionally, convert dissimilarity matrix to dataframe
bray.df <- as.data.frame(as.matrix(bray.df))

# Set the row names of the distance matrix to Dive Names
# (assumes rows of SPE2 are in the same site order as data.c - TODO confirm)
rownames(bray.df) <- data.c$Dive.Name
colnames(bray.df) <- data.c$Dive.Name

#bray.df <- cbind(bray.df, ENV2)

# The matrix can be further used to explore how distant sites are. 
# 1) Are sites more similar that are closer to each other? GEOGRAPHIC DISTANCE

# Geographic distance: take the first annotated position of each dive as that
# site's representative coordinate.
site_coords <- lapply(divesites, function(dv) {
  data.a %>%
    filter(Dive.Name == dv) %>%
    slice(1) %>%
    dplyr::select(Dive.Name, Latitude..deg., Longitude..deg.)
})

# Stack the per-site rows, keeping longitude/latitude in the order distm() expects
coord_mat <- do.call(
  rbind,
  lapply(site_coords, function(df) df[, c("Longitude..deg.", "Latitude..deg.")])
)

# Pairwise great-circle distances between sites (geosphere::distm, Haversine)
distance_matrix <- distm(coord_mat)

rownames(distance_matrix) <- data.c$Dive.Name
colnames(distance_matrix) <- data.c$Dive.Name

# Print the distance matrix - if wanted
# print(distance_matrix)
heatmap(distance_matrix)

# Melt both square matrices to long (pairwise) format so they can be joined
geo_long <- setNames(as.data.frame(as.table(distance_matrix)),
                     c("Site1", "Site2", "Geo"))
bray_long <- setNames(as.data.frame(as.table(as.matrix(bray.df))),
                      c("Site1", "Site2", "Bray_Curtis"))

# Join on the site pair and drop self-comparisons (the diagonal)
combined_df <- merge(geo_long, bray_long, by = c("Site1", "Site2"))
combined_df <- subset(combined_df, Site1 != Site2)

# Dissimilarity vs. geographic distance, with a linear fit and Pearson r
ggplot(combined_df, aes(x = Geo, y = Bray_Curtis)) +
  geom_point(aes(color = Bray_Curtis)) +
  geom_smooth(method = "lm", se = TRUE, color = "black") +
  stat_cor(method = "pearson", label.x = 1500000, label.y = 0.55) +
  labs(x = "Geographic Distance (m)", y = "Bray-Curtis Dissimilarity") +
  theme_minimal() +
  scale_color_viridis_c()
## `geom_smooth()` using formula = 'y ~ x'

# 2) Are sites more similar that share similar depths? DEPTH DISTANCE

# Depth distance: absolute pairwise difference in mean depth between sites.
# Vectorized with outer() instead of a nested for-loop (same values, one step).
# data.c holds the per-site average depths.
depth_distance_matrix <- abs(outer(data.c$depth, data.c$depth, "-"))

# Set the row names of the distance matrix to Dive Names
rownames(depth_distance_matrix) <- data.c$Dive.Name
colnames(depth_distance_matrix) <- data.c$Dive.Name

# Long format for the depth distances, then join with the Bray-Curtis pairs
depth_long <- setNames(as.data.frame(as.table(depth_distance_matrix)),
                       c("Site1", "Site2", "Depth"))
combined_df2 <- merge(depth_long, bray_long, by = c("Site1", "Site2"))
combined_df2 <- subset(combined_df2, Site1 != Site2)  # drop self-comparisons

# Dissimilarity vs. depth distance; Pearson r on the pairwise values
ggplot(combined_df2, aes(x = Depth, y = Bray_Curtis)) +
  geom_point(aes(color = Bray_Curtis)) +
  geom_smooth(method = "lm", se = TRUE, color = "black") +
  stat_cor(method = "pearson", label.x = 2000, label.y = 0.55) +
  labs(x = "Depth Distance (m)", y = "Bray-Curtis Dissimilarity") +
  theme_minimal() +
  scale_color_viridis_c()
## `geom_smooth()` using formula = 'y ~ x'

Summary of Beta Diversity. Geographic and depth distance have an impact on the beta diversity. The similarity increases when the sites are geographically closer, and even stronger when the depth distance is considered.

8 COMMUNITY COMPOSITION

8.1 PcOA

# Principal Coordinates Analysis (classical MDS) on the Bray-Curtis matrix,
# keeping the first two axes and the eigenvalues (eig = T)
pcoa.bray <- cmdscale(bray.df, k = 2, eig = T)
# print(pcoa.bray) #explore output if wanted

# extract axis positions for each site from cmdscale object and create a dataframe for plotting
pcoa.bray.plotting <- as.data.frame(pcoa.bray$points)
colnames(pcoa.bray.plotting) <- c("pcoa1", "pcoa2")

# Create alpha diversity dataframe, include info about environment and locations
# (assumes ENV2 rows are in the same site order as the PCoA scores - TODO confirm)
pcoa.bray.plotting <- cbind(pcoa.bray.plotting, ENV2)

# calculate the proportion of variance in the data which is explained by the first two PCoA axes
pcoa.bray$eig[1]/(sum(pcoa.bray$eig))
## [1] 0.1780102
pcoa.bray$eig[2]/(sum(pcoa.bray$eig))
## [1] 0.1293653
pcoa.bray.plotting$site <- data.c$Dive.Name
pcoa.bray.plotting$watermass <- data.c$watermass

# PCoA ordination plot: sites coloured and labelled by dive, shaped by water mass
ggplot(pcoa.bray.plotting,
       aes(x = pcoa1, y = pcoa2, label = site, color = site, shape = watermass)) +
  geom_point(size = 4) +
  #scale_shape_manual(values = c(15, 16, 17, 18, 19))+ 
  #ggtitle("PCA of Bray Curtis Dissimilarities") +
  geom_text_repel(min.segment.length = 0, size = 4, box.padding = 1) +
  labs(col = "Site", shape = "Water Mass", x = "PCoA1", y = "PCoA2", size = 5) +
  theme_linedraw() +
  # Confidence ellipses
  # stat_ellipse(aes(col = current), level = 0.95, linetype = "dashed") +
  theme(
    panel.grid = element_blank(),
    legend.title = element_text(size = 10),
    legend.text = element_text(size = 10),
    axis.title.x = element_text(size = 14),
    axis.title.y = element_text(size = 14),
    legend.background = element_blank(),
    legend.box.background = element_rect(colour = "black"),
    legend.position = "right"
  )

# Test the significance of watermass and other factors on the distances
# TEST: Adonis2/PERMANOVA - multivariate test on the Bray-Curtis distance matrix,
# fitting each candidate variable as a single-term model
variables <- c("depth", "watermass", "feature", "slope","rugosity","eastness", "sed_consistency","lon","lat")
results <- vector("list", length(variables))
for (i in seq_along(variables)) {
  results[[i]] <- adonis2(as.formula(paste("bray.df ~", variables[i])), data = data.c)
}
results
## [[1]]
## Permutation test for adonis under reduced model
## Permutation: free
## Number of permutations: 999
## 
## adonis2(formula = as.formula(paste("bray.df ~", var)), data = data.c)
##          Df SumOfSqs     R2      F Pr(>F)    
## Model     1   0.8436 0.1557 2.3974  0.001 ***
## Residual 13   4.5744 0.8443                  
## Total    14   5.4181 1.0000                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## [[2]]
## Permutation test for adonis under reduced model
## Permutation: free
## Number of permutations: 999
## 
## adonis2(formula = as.formula(paste("bray.df ~", var)), data = data.c)
##          Df SumOfSqs      R2      F Pr(>F)    
## Model     4   2.3849 0.44019 1.9658  0.001 ***
## Residual 10   3.0331 0.55981                  
## Total    14   5.4181 1.00000                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## [[3]]
## Permutation test for adonis under reduced model
## Permutation: free
## Number of permutations: 999
## 
## adonis2(formula = as.formula(paste("bray.df ~", var)), data = data.c)
##          Df SumOfSqs      R2      F Pr(>F)
## Model     1   0.3557 0.06564 0.9133  0.634
## Residual 13   5.0624 0.93436              
## Total    14   5.4181 1.00000              
## 
## [[4]]
## Permutation test for adonis under reduced model
## Permutation: free
## Number of permutations: 999
## 
## adonis2(formula = as.formula(paste("bray.df ~", var)), data = data.c)
##          Df SumOfSqs      R2      F Pr(>F)
## Model     1   0.4233 0.07812 1.1017  0.299
## Residual 13   4.9948 0.92188              
## Total    14   5.4181 1.00000              
## 
## [[5]]
## Permutation test for adonis under reduced model
## Permutation: free
## Number of permutations: 999
## 
## adonis2(formula = as.formula(paste("bray.df ~", var)), data = data.c)
##          Df SumOfSqs      R2     F Pr(>F)  
## Model     1   0.5206 0.09609 1.382  0.072 .
## Residual 13   4.8974 0.90391               
## Total    14   5.4181 1.00000               
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## [[6]]
## Permutation test for adonis under reduced model
## Permutation: free
## Number of permutations: 999
## 
## adonis2(formula = as.formula(paste("bray.df ~", var)), data = data.c)
##          Df SumOfSqs      R2      F Pr(>F)
## Model     1   0.4412 0.08142 1.1524  0.238
## Residual 13   4.9769 0.91858              
## Total    14   5.4181 1.00000              
## 
## [[7]]
## Permutation test for adonis under reduced model
## Permutation: free
## Number of permutations: 999
## 
## adonis2(formula = as.formula(paste("bray.df ~", var)), data = data.c)
##          Df SumOfSqs      R2      F Pr(>F)
## Model     1   0.4419 0.08156 1.1544  0.247
## Residual 13   4.9762 0.91844              
## Total    14   5.4181 1.00000              
## 
## [[8]]
## Permutation test for adonis under reduced model
## Permutation: free
## Number of permutations: 999
## 
## adonis2(formula = as.formula(paste("bray.df ~", var)), data = data.c)
##          Df SumOfSqs      R2      F Pr(>F)  
## Model     1   0.5923 0.10932 1.5955  0.018 *
## Residual 13   4.8258 0.89068                
## Total    14   5.4181 1.00000                
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## [[9]]
## Permutation test for adonis under reduced model
## Permutation: free
## Number of permutations: 999
## 
## adonis2(formula = as.formula(paste("bray.df ~", var)), data = data.c)
##          Df SumOfSqs      R2      F Pr(>F)  
## Model     1   0.5410 0.09986 1.4421  0.048 *
## Residual 13   4.8770 0.90014                
## Total    14   5.4181 1.00000                
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

8.2 Transform data

To continue with the ordination methods, the environmental and species data need to be transformed. This is typically necessary for count data with many zeros. Most commonly used are the Hellinger transformation for species data and standardisation for environmental data.

# Morphotype data
# Hellinger transformation
# (square root of relative abundances per site; downweights dominant morphotypes
# and makes zero-rich count data suitable for Euclidean-based ordination)

# SPE is the "full" data, so all annotations
SPE.hel <- decostand(SPE, method="hellinger")
plotNormalHistogram(SPE.hel) # checking the distribution

#SPE2 is the aggregated data, grouped by sites
SPE2.hel <- decostand(SPE2, method="hellinger")
plotNormalHistogram(SPE2.hel) # checking the distribution

# Hellinger pre-transformation of the species data 
# Compute square root of relative abundances per site
SPE.hel3 <-decostand(SPE.fam, method="hellinger") # full data, not grouped by site 
SPE.hel3$site <- data.b$Dive.Name 
SPE.hel3 <- SPE.hel3[SPE.hel3$site != "2306_08", ] # drop the dive with missing env (backscatter) data
SPE.hel3 <- SPE.hel3 %>% dplyr::select(-site) # remove the site column again

# Environmental data
# Standardization (centre each variable to zero mean, scale to unit variance)

env.z <- decostand(ENV, method="standardize") # After Borcard 
env.z <- na.omit(env.z) # remove the rows with NAs, that is no values. This removes the dive EX2306 Dive 08 as there was no Backscatter data

env2.z <- decostand(ENV2, method= "standardize") # After Borcard 
env2.z <- na.omit(env2.z) # remove the rows with NAs, that is no values. This removes the dive EX2306 Dive 08 as there was no Backscatter data

8.3 Cluster analysis

8.3.1 Morphotype clustering

The cluster analysis is an exploratory data analysis, and can be used to group a set of data into clusters. First, I am exploring the species data (morphotype-level) to assess clustering patterns among different morphotypes per site. The dissimilarity between morphotypes is computed using different clustering methods to find the best fit. I am using the site grouped dataset (SPE2) because I want to see which sites share similar morphotype compositions.

# First: Species data clustering
# Used Borcard's suggestion to explore the species data 

# Set seed for reproducibility
set.seed(123)

# Distance matrix among sites: Euclidean distance on the Hellinger-transformed
# abundances (i.e. the Hellinger distance).
# NOTE(review): the plot titles below say "Chord" - chord distance would be
# Euclidean distance on norm-standardized data; confirm which was intended.
spe.ch <- as.matrix(vegdist(SPE2.hel, "euc")) 
rownames(spe.ch) <- rownames(SPE2.hel)  # Ensure meaningful labels if available
colnames(spe.ch) <- rownames(SPE2.hel) 
spe.ch <- as.dist(spe.ch)

# Compute different hierarchical clustering methods
spe.ch.single <- hclust(spe.ch, method = "single")
spe.ch.complete <- hclust(spe.ch, method = "complete")
spe.ch.UPGMA <- hclust(spe.ch, method = "average") # UPGMA (best cophenetic correlation)
spe.ch.ward <- hclust(spe.ch, method = "ward.D2")

# Plot dendrograms
par(mfrow = c(2,2))  # Arrange plots in a grid
plot(spe.ch.single, main = "Chord - Single linkage", cex = 0.8)
plot(spe.ch.complete, main = "Chord - Complete linkage", cex = 0.8)
plot(spe.ch.UPGMA, main = "Chord - UPGMA", cex = 0.8)
plot(spe.ch.ward, main = "Chord - Ward", cex = 0.8)

par(mfrow = c(1,1))  # Reset plot layout

# Compute cophenetic correlation coefficients to evaluate clustering methods
# (correlation between the original distances and each dendrogram's cophenetic
# distances; higher = the dendrogram distorts the distance matrix less)
spe.ch.single.coph <- cophenetic(spe.ch.single)
spe.ch.comp.coph <- cophenetic(spe.ch.complete)
spe.ch.UPGMA.coph <- cophenetic(spe.ch.UPGMA)
spe.ch.ward.coph <- cophenetic(spe.ch.ward)

# Print correlation values
cor(spe.ch, spe.ch.single.coph)  # Single Linkage
## [1] 0.7773492
cor(spe.ch, spe.ch.comp.coph)  # Complete Linkage
## [1] 0.8243592
cor(spe.ch, spe.ch.UPGMA.coph)  # UPGMA (expected highest)
## [1] 0.8797714
cor(spe.ch, spe.ch.ward.coph)  # Ward's
## [1] 0.6844803
# Highest correlation found in the UPGMA clustering with 0.88

# Cluster validation using bootstrap resampling
# pvclust clusters the COLUMNS of its input, so the matrix is transposed to
# cluster sites; method matches the UPGMA/Euclidean choice made above
spech.pv <- pvclust(t(SPE2.hel), method.hclust = "average", method.dist = "euc", parallel = TRUE)
## Creating a temporary cluster...done:
## socket cluster with 13 nodes on host 'localhost'
## Multiscale bootstrap... Done.
plot(spech.pv)

# Highlight clusters with high AU p-values (≥0.95 = significant)
pvrect(spech.pv, alpha = 0.95, pv = "au")

# Alternative: Determine optimal number of clusters using silhouette method
# NOTE(review): silhouette() comes from the 'cluster' package, which is not in
# the library() list at the top of this document - confirm it is loaded.
k_values <- 2:14

# Average silhouette width for each candidate k (column 3 of the silhouette
# object is "sil_width"); vapply guarantees a numeric result per k
sil_widths <- vapply(k_values, function(k) {
  cluster_assignments <- cutree(spe.ch.UPGMA, k = k)  # cluster labels at this k
  mean(silhouette(cluster_assignments, spe.ch)[, 3])
}, numeric(1))

# Plot silhouette width vs. number of clusters
plot(k_values, sil_widths, type = "b", pch = 19, frame = FALSE,
     xlab = "Number of Clusters (k)", ylab = "Average Silhouette Width",
     main = "Optimal Cluster Selection via Silhouette Width")
abline(v = k_values[which.max(sil_widths)], col = "red", lty = 2)  # Mark optimal k

# Cut the UPGMA dendrogram into 7 groups (k chosen from the silhouette plot above)
cluster_membership <- cutree(spe.ch.UPGMA, k = 7)
table(cluster_membership)
## cluster_membership
## 1 2 3 4 5 6 7 
## 5 2 1 3 1 1 2
# Final reordered dendrogram with chosen clusters
# reorder.hclust (gclus) reorders leaves to follow the original distance gradient
spe.chwo <- reorder.hclust(spe.ch.UPGMA, spe.ch)
plot(spe.chwo, hang = 0, xlab = "Cluster Groups", ylab = "Height", main = "Chord - UPGMA (Reordered)")
rect.hclust(spe.chwo, k = 7)  # Highlight clusters (adjusted k to 7 according to the previous sillhouette plot)

#Rows are dive IDs:
#1) 2304_02 
#2) 2304_05       
#3) 2304_07      
#4) 2304_08      
#5 2306_01      
#6 2306_03       
#7 2306_04      
#8 2306_05      
#9 2306_06      
#10 2306_07  
#11 2306_08
#12 2306_12       
#13 2306_14       
#14 2306_15     
#15 2306_18 

8.3.2 Environmental clustering

Continuing with the environmental data. Which sites cluster together because they share similar environmental conditions? Does this correspond to geography? How does it relate to the species clusters?

# Environmental data clustering analysis
# Ensure NA rows are removed for environmental data

# Compute the distance matrix using Euclidean distance on the standardized data.
# na.omit() drops Dive 2306_08, which has missing (backscatter) env data, so the
# label vector below deliberately excludes it. Labels defined once (DRY) instead
# of duplicating the 14-element vector for rownames and colnames.
env.ch <- as.matrix(vegdist(na.omit(env2.z), "euc"))
site_labels <- c("2304_02", "2304_05", "2304_07", "2304_08", "2306_01", "2306_03",
                 "2306_04", "2306_05", "2306_06", "2306_07", "2306_12", "2306_14",
                 "2306_15", "2306_18")
dimnames(env.ch) <- list(site_labels, site_labels)
env.ch <- as.dist(env.ch)

# Fit the four agglomerative clustering methods on the environmental distances
env.ch.single <- hclust(env.ch, method = "single")
env.ch.complete <- hclust(env.ch, method = "complete")
env.ch.UPGMA <- hclust(env.ch, method = "average")  # UPGMA
env.ch.ward <- hclust(env.ch, method = "ward.D2")

# Draw all four dendrograms side by side for visual comparison
dendro_fits <- list(
  "Chord - Single linkage" = env.ch.single,
  "Chord - Complete linkage" = env.ch.complete,
  "Chord - UPGMA" = env.ch.UPGMA,
  "Chord - Ward" = env.ch.ward
)
par(mfrow = c(2, 2))  # 2x2 grid of plots
for (ttl in names(dendro_fits)) {
  plot(dendro_fits[[ttl]], main = ttl)
}
par(mfrow = c(1, 1))  # Reset plot layout

# Compute cophenetic correlations to assess clustering method
# (higher correlation = dendrogram better preserves the original distances)
env.ch.single.coph <- cophenetic(env.ch.single)
env.ch.comp.coph <- cophenetic(env.ch.complete)
env.ch.UPGMA.coph <- cophenetic(env.ch.UPGMA)
env.ch.ward.coph <- cophenetic(env.ch.ward)

# Print correlation values for each method
cor(env.ch, env.ch.single.coph)  # Single Linkage
## [1] 0.676066
cor(env.ch, env.ch.comp.coph)    # Complete Linkage
## [1] 0.7015525
cor(env.ch, env.ch.UPGMA.coph)   # UPGMA (highest correlation)
## [1] 0.7392086
cor(env.ch, env.ch.ward.coph)    # Ward's
## [1] 0.6750676
# Perform p-value bootstrapping to validate clusters (approximate unbiased p-values)
# NOTE(review): method.dist = "eucl" here vs "euc" in the species pvclust call;
# both appear to resolve to Euclidean, but use one spelling for consistency
envch.pv <- pvclust(t(env2.z), method.hclust = "average", method.dist = "eucl", parallel = TRUE)
## Creating a temporary cluster...done:
## socket cluster with 13 nodes on host 'localhost'
## Multiscale bootstrap... Done.
plot(envch.pv)
pvrect(envch.pv, alpha = 0.95, pv = "au")  # Highlight significant clusters (AU p-value ≥ 0.95)

# Reorder the dendrogram based on environmental dissimilarities
env.chwo <- reorder.hclust(env.ch.UPGMA, env.ch)
plot(env.chwo, hang = 0, xlab = "3 groups", ylab = "Height", main = "Chord - UPGMA (Reordered)")
rect.hclust(env.chwo, k = 3)  # Highlight 3 clusters

To further explore the clustering of species and how they group by similarity or dissimilarity across the sites, I performed a PCA. I also ran a PCA on the environmental data to explore how environmental variables group together across the sites. Then a Procrustes analysis assesses whether the species clustering and environmental clustering are significantly correlated. If yes, proceed with multivariate analysis.

8.4 Explorative PCA

# PCA on Hellinger-transformed species data (SPE.hel)
# Need to delete the Dive 08 as NA in env data
# NOTE(review): this overwrites SPE.hel in place - later chunks see the filtered data
SPE.hel$site <- data.b$Dive.Name 
SPE.hel <- SPE.hel[SPE.hel$site != "2306_08", ]
SPE.hel <- SPE.hel %>% dplyr::select(-site) # remove the site column again
# Morphotypes seen only on Dive 08 are now all-zero columns; the colSums filter
# below drops them (zero-variance columns would break scale = TRUE)
spe.pca <- prcomp(SPE.hel[, colSums(SPE.hel != 0) > 0], scale = TRUE)  # You can scale if needed
# PCA plot for species data
biplot(spe.pca, main = "PCA of Species Data") # You can adjust the biplot options to visualize axes and vectors better

#screeplot(spe.pca) 

# PCA on Environmental Data (env.z standardized)
env.pca <- prcomp( env.z[, sapply(env.z, is.numeric)], scale = TRUE) # exclude the previously added site and water mass and feature columns (not needed here anyways) 
biplot(env.pca, main = "PCA of Environmental Data") # Similar to species PCA, biplot shows the principal components

#screeplot(env.pca)

8.4.1 Procrustes Analysis

Procrustes analysis tests whether the species composition (PCA on species data) and environmental variables (PCA on environmental data) are significantly correlated. If they are, it suggests that the species distributions are likely influenced by the environmental gradients in your dataset. A significant correlation means that the patterns in species composition are in some way “shaped” by the environmental factors, making CCA a suitable next step.

# 2. Perform Procrustes Analysis
# protest() permutes to test whether the two ordinations (environmental PCA vs
# species PCA site scores) are more concordant than expected by chance
env.Procrustes.sign <- protest(env.pca, spe.pca, scores="sites") # p < 0.001
env.Procrustes.sign
## 
## Call:
## protest(X = env.pca, Y = spe.pca, scores = "sites") 
## 
## Procrustes Sum of Squares (m12 squared):        0.9017 
## Correlation in a symmetric Procrustes rotation: 0.3135 
## Significance:  0.001 
## 
## Permutation: free
## Number of permutations: 999

Procrustes shows a moderate correlation between species and env data. R = 0.31. Significance (p < 0.001) indicates that the relationship between the environmental and species data is not random. The result suggests that while the correlation is moderate, the association is still meaningful and statistically reliable. Proceed with multivariate analysis (CCA)

9 CCA

Using CCA (Canonical Correspondence Analysis) as this assumes unimodal relationships instead of linear. In nature, this is probably a more realistic assumption.

9.1 Morphotype-level analysis

# Using species data (Hellinger transformed) of the full wide dataset 

# 1) Run a full model 
# This includes all variables that are possible to include (excluding site, lat, lon, watermass). Here it is very important that all NA's are removed, and rows are the same! To double check, use nrow on both df's
# NOTE(review): vegan's cca() does not document a `scale` argument (rda() does);
# confirm that `scale = TRUE` actually affects this fit

cca_full <- cca(SPE.hel ~ depth + slope + eastness + bs + rugosity + northness + sed_consistency, data = env.z, scale = TRUE) # as a reminder, dive 08 EX2306 is excluded as NA's in backscatter
anova(cca_full, by = "margin") #test each variables significance for the model
## Permutation test for cca under reduced model
## Marginal effects of terms
## Permutation: free
## Number of permutations: 999
## 
## Model: cca(formula = SPE.hel ~ depth + slope + eastness + bs + rugosity + northness + sed_consistency, data = env.z, scale = TRUE)
##                  Df ChiSquare      F Pr(>F)    
## depth             1     0.714 9.4782  0.001 ***
## slope             1     0.483 6.4055  0.001 ***
## eastness          1     0.415 5.5046  0.001 ***
## bs                1     0.300 3.9853  0.001 ***
## rugosity          1     0.536 7.1101  0.001 ***
## northness         1     0.291 3.8687  0.001 ***
## sed_consistency   1     0.335 4.4467  0.001 ***
## Residual        420    31.640                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(cca_full) # The model is significant
## Permutation test for cca under reduced model
## Permutation: free
## Number of permutations: 999
## 
## Model: cca(formula = SPE.hel ~ depth + slope + eastness + bs + rugosity + northness + sed_consistency, data = env.z, scale = TRUE)
##           Df ChiSquare      F Pr(>F)    
## Model      7      3.46 6.5622  0.001 ***
## Residual 420     31.64                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Test the significance of individual canonical axes
anova(cca_full, by = "axis")
## Permutation test for cca under reduced model
## Forward tests for axes
## Permutation: free
## Number of permutations: 999
## 
## Model: cca(formula = SPE.hel ~ depth + slope + eastness + bs + rugosity + northness + sed_consistency, data = env.z, scale = TRUE)
##           Df ChiSquare       F Pr(>F)    
## CCA1       1     0.856 11.3696  0.001 ***
## CCA2       1     0.711  9.4418  0.001 ***
## CCA3       1     0.634  8.4169  0.001 ***
## CCA4       1     0.450  5.9716  0.001 ***
## CCA5       1     0.341  4.5271  0.001 ***
## CCA6       1     0.272  3.6115  0.001 ***
## CCA7       1     0.196  2.5969  1.000    
## Residual 420    31.640                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(cca_full)
## 
## Call:
## cca(formula = SPE.hel ~ depth + slope + eastness + bs + rugosity +      northness + sed_consistency, data = env.z, scale = TRUE) 
## 
## Partitioning of scaled Chi-square:
##               Inertia Proportion
## Total           35.10    1.00000
## Constrained      3.46    0.09859
## Unconstrained   31.64    0.90141
## 
## Eigenvalues, and their contribution to the scaled Chi-square 
## 
## Importance of components:
##                         CCA1    CCA2    CCA3    CCA4     CCA5     CCA6     CCA7
## Eigenvalue            0.8565 0.71127 0.63407 0.44986 0.341034 0.272060 0.195634
## Proportion Explained  0.0244 0.02026 0.01806 0.01282 0.009716 0.007751 0.005574
## Cumulative Proportion 0.0244 0.04467 0.06273 0.07555 0.085263 0.093014 0.098587
##                           CA1     CA2     CA3     CA4     CA5     CA6     CA7
## Eigenvalue            0.92820 0.89690 0.88058 0.87528 0.86187 0.85492 0.82628
## Proportion Explained  0.02644 0.02555 0.02509 0.02494 0.02455 0.02436 0.02354
## Cumulative Proportion 0.12503 0.15058 0.17567 0.20061 0.22516 0.24952 0.27306
##                           CA8    CA9    CA10   CA11    CA12    CA13    CA14
## Eigenvalue            0.80171 0.7793 0.73245 0.6878 0.65562 0.62515 0.61543
## Proportion Explained  0.02284 0.0222 0.02087 0.0196 0.01868 0.01781 0.01753
## Cumulative Proportion 0.29590 0.3181 0.33897 0.3586 0.37724 0.39506 0.41259
##                          CA15    CA16    CA17    CA18    CA19    CA20    CA21
## Eigenvalue            0.60524 0.55668 0.52228 0.49339 0.48206 0.47576 0.45136
## Proportion Explained  0.01724 0.01586 0.01488 0.01406 0.01373 0.01355 0.01286
## Cumulative Proportion 0.42983 0.44569 0.46057 0.47463 0.48836 0.50192 0.51478
##                         CA22    CA23    CA24    CA25    CA26    CA27    CA28
## Eigenvalue            0.4387 0.42864 0.42077 0.41271 0.39748 0.39641 0.37059
## Proportion Explained  0.0125 0.01221 0.01199 0.01176 0.01132 0.01129 0.01056
## Cumulative Proportion 0.5273 0.53949 0.55148 0.56323 0.57456 0.58585 0.59641
##                           CA29     CA30     CA31     CA32     CA33     CA34
## Eigenvalue            0.342691 0.332219 0.329492 0.322186 0.311197 0.302382
## Proportion Explained  0.009763 0.009465 0.009387 0.009179 0.008866 0.008615
## Cumulative Proportion 0.606173 0.615638 0.625025 0.634204 0.643070 0.651685
##                           CA35     CA36     CA37     CA38     CA39     CA40
## Eigenvalue            0.299892 0.287447 0.277909 0.274403 0.269177 0.259115
## Proportion Explained  0.008544 0.008189 0.007918 0.007818 0.007669 0.007382
## Cumulative Proportion 0.660229 0.668418 0.676336 0.684154 0.691822 0.699205
##                           CA41     CA42     CA43     CA44     CA45    CA46
## Eigenvalue            0.255704 0.246995 0.239271 0.235706 0.234590 0.23062
## Proportion Explained  0.007285 0.007037 0.006817 0.006715 0.006683 0.00657
## Cumulative Proportion 0.706490 0.713526 0.720343 0.727059 0.733742 0.74031
##                           CA47     CA48     CA49     CA50     CA51     CA52
## Eigenvalue            0.224925 0.221429 0.215082 0.211413 0.205507 0.204427
## Proportion Explained  0.006408 0.006309 0.006128 0.006023 0.005855 0.005824
## Cumulative Proportion 0.746720 0.753029 0.759157 0.765180 0.771035 0.776859
##                           CA53     CA54     CA55     CA56   CA57     CA58
## Eigenvalue            0.203490 0.195685 0.195099 0.189266 0.1860 0.185257
## Proportion Explained  0.005797 0.005575 0.005558 0.005392 0.0053 0.005278
## Cumulative Proportion 0.782656 0.788231 0.793790 0.799182 0.8045 0.809760
##                           CA59     CA60     CA61     CA62     CA63    CA64
## Eigenvalue            0.182492 0.178831 0.172381 0.167678 0.167216 0.16357
## Proportion Explained  0.005199 0.005095 0.004911 0.004777 0.004764 0.00466
## Cumulative Proportion 0.814959 0.820054 0.824965 0.829743 0.834507 0.83917
##                           CA65     CA66     CA67     CA68     CA69     CA70
## Eigenvalue            0.160487 0.159301 0.158040 0.155975 0.155376 0.149038
## Proportion Explained  0.004572 0.004538 0.004503 0.004444 0.004427 0.004246
## Cumulative Proportion 0.843739 0.848277 0.852780 0.857224 0.861650 0.865896
##                           CA71     CA72     CA73     CA74     CA75     CA76
## Eigenvalue            0.148498 0.144828 0.142084 0.141582 0.139678 0.137450
## Proportion Explained  0.004231 0.004126 0.004048 0.004034 0.003979 0.003916
## Cumulative Proportion 0.870127 0.874253 0.878301 0.882335 0.886314 0.890230
##                           CA77    CA78     CA79     CA80     CA81     CA82
## Eigenvalue            0.132265 0.12918 0.124200 0.121700 0.118337 0.115229
## Proportion Explained  0.003768 0.00368 0.003538 0.003467 0.003371 0.003283
## Cumulative Proportion 0.893999 0.89768 0.901217 0.904685 0.908056 0.911339
##                           CA83     CA84     CA85     CA86     CA87     CA88
## Eigenvalue            0.114571 0.108790 0.104867 0.098667 0.097470 0.094529
## Proportion Explained  0.003264 0.003099 0.002988 0.002811 0.002777 0.002693
## Cumulative Proportion 0.914603 0.917702 0.920690 0.923501 0.926278 0.928971
##                          CA89     CA90     CA91     CA92     CA93     CA94
## Eigenvalue            0.09230 0.089326 0.086456 0.082352 0.080620 0.079806
## Proportion Explained  0.00263 0.002545 0.002463 0.002346 0.002297 0.002274
## Cumulative Proportion 0.93160 0.934146 0.936609 0.938955 0.941252 0.943526
##                           CA95     CA96     CA97     CA98     CA99    CA100
## Eigenvalue            0.077474 0.076696 0.076132 0.074435 0.072928 0.072053
## Proportion Explained  0.002207 0.002185 0.002169 0.002121 0.002078 0.002053
## Cumulative Proportion 0.945733 0.947918 0.950087 0.952208 0.954285 0.956338
##                          CA101    CA102    CA103   CA104    CA105   CA106
## Eigenvalue            0.069339 0.067555 0.067260 0.06633 0.064734 0.06422
## Proportion Explained  0.001975 0.001925 0.001916 0.00189 0.001844 0.00183
## Cumulative Proportion 0.958314 0.960238 0.962154 0.96404 0.965888 0.96772
##                          CA107    CA108    CA109    CA110    CA111    CA112
## Eigenvalue            0.060512 0.058473 0.057243 0.053652 0.050705 0.049336
## Proportion Explained  0.001724 0.001666 0.001631 0.001529 0.001445 0.001406
## Cumulative Proportion 0.969442 0.971108 0.972739 0.974267 0.975712 0.977118
##                          CA113    CA114    CA115    CA116    CA117    CA118
## Eigenvalue            0.047588 0.046038 0.045392 0.044616 0.042942 0.039477
## Proportion Explained  0.001356 0.001312 0.001293 0.001271 0.001223 0.001125
## Cumulative Proportion 0.978473 0.979785 0.981078 0.982349 0.983573 0.984697
##                          CA119    CA120    CA121     CA122     CA123     CA124
## Eigenvalue            0.038809 0.037800 0.035187 0.0322444 0.0308331 0.0302128
## Proportion Explained  0.001106 0.001077 0.001002 0.0009186 0.0008784 0.0008608
## Cumulative Proportion 0.985803 0.986880 0.987882 0.9888011 0.9896796 0.9905403
##                           CA125    CA126     CA127     CA128   CA129    CA130
## Eigenvalue            0.0296901 0.026256 0.0252285 0.0244832 0.02317 0.021869
## Proportion Explained  0.0008459 0.000748 0.0007188 0.0006975 0.00066 0.000623
## Cumulative Proportion 0.9913862 0.992134 0.9928530 0.9935505 0.99421 0.994834
##                           CA131     CA132     CA133     CA134     CA135
## Eigenvalue            0.0207268 0.0189054 0.0181229 0.0174626 0.0166323
## Proportion Explained  0.0005905 0.0005386 0.0005163 0.0004975 0.0004739
## Cumulative Proportion 0.9954240 0.9959627 0.9964790 0.9969765 0.9974503
##                           CA136     CA137     CA138     CA139     CA140
## Eigenvalue            0.0147257 0.0138336 0.0127439 0.0120723 0.0103017
## Proportion Explained  0.0004195 0.0003941 0.0003631 0.0003439 0.0002935
## Cumulative Proportion 0.9978699 0.9982640 0.9986271 0.9989710 0.9992645
##                          CA141     CA142     CA143     CA144     CA145
## Eigenvalue            0.009055 0.0066055 0.0048864 0.0039509 1.318e-03
## Proportion Explained  0.000258 0.0001882 0.0001392 0.0001126 3.754e-05
## Cumulative Proportion 0.999522 0.9997107 0.9998499 0.9999625 1.000e+00
## 
## Accumulated constrained eigenvalues
## Importance of components:
##                         CCA1   CCA2   CCA3   CCA4    CCA5    CCA6    CCA7
## Eigenvalue            0.8565 0.7113 0.6341 0.4499 0.34103 0.27206 0.19563
## Proportion Explained  0.2475 0.2055 0.1832 0.1300 0.09855 0.07862 0.05653
## Cumulative Proportion 0.2475 0.4531 0.6363 0.7663 0.86484 0.94347 1.00000
# Proportion of total inertia explained by the constrained axes of an
# ordination model (e.g. a vegan cca/rda result).
# Constrained inertia is total inertia minus the unconstrained (CA) inertia.
#
# @param cca_model An ordination object with components `tot.chi` (total
#   inertia) and `CA$tot.chi` (unconstrained inertia).
# @return The fraction of total variation explained, in [0, 1].
calculate_proportion_explained <- function(cca_model) {
  constrained_variation <- cca_model$tot.chi - cca_model$CA$tot.chi
  total_variation <- cca_model$tot.chi
  constrained_variation / total_variation
}

# How much does the full model explain? About 9.86% of the total variation in
# species composition. This sounds low, but in ecological, marine studies not
# uncommon. It just means a lot more is shaping the communities, = the
# unconstrained variation (e.g. species interactions, dispersal strategies etc.)
calculate_proportion_explained(cca_full)
## [1] 0.09858745
# Check for collinearity of variables - Variance Inflation Factors
vif.cca(cca_full) 
##           depth           slope        eastness              bs        rugosity 
##        1.461075        1.183490        1.561514        1.280600        1.617593 
##       northness sed_consistency 
##        1.322708        1.302115
#VIF < 5: No serious collinearity; VIF 5–10: Moderate collinearity, consider reducing variables; VIF > 10: High collinearity, remove or combine correlated predictors - So all the environmental variables are good to use. 

# Run a second full model, additionally using watermass as a constraint
# (site and feature are attached to env.z here too, but only watermass enters this formula)

# First add site, watermass and feature as factors to env.z
# NOTE(review): this assumes the rows of env.z line up 1:1 with data.b after the
# filter below (env.z presumably built from data.b upstream) — confirm.
data.b <- data.b[data.b$Dive.Name != "2306_08", ] # drop Dive 08 of EX2306 first
env.z$site <- as.factor(data.b$Dive.Name) # Add the site (dive) information
env.z$watermass <- as.factor(data.b$watermass) # Add the water mass information
env.z$feature <- as.factor(data.b$feature) # Add the feature information

cca_full2 <- cca(SPE.hel ~ slope + eastness + bs + rugosity + sed_consistency + northness + depth + watermass, data = env.z, scale = TRUE)

vif.cca(cca_full2) 
##             slope          eastness                bs          rugosity 
##          2.128915          3.236612          1.705460         10.504427 
##   sed_consistency         northness             depth     watermassANSC 
##          1.623899          1.522587         19.069032          8.398055 
##      watermassPDW watermassPDW/LCDW     watermassPSIW 
##          3.760601         17.006028          7.521306
# NOTE: Watermass is categorical, so it appears as individual variables here.
# Depth and watermass are highly correlated - which is to be expected, but won't be used in addition to watermass anyways.But we can check if it adds additional information to the model:

anova(cca_full2, by = "terms") 
## Permutation test for cca under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
## 
## Model: cca(formula = SPE.hel ~ slope + eastness + bs + rugosity + sed_consistency + northness + depth + watermass, data = env.z, scale = TRUE)
##                  Df ChiSquare       F Pr(>F)    
## slope             1    0.5330  7.5354  0.001 ***
## eastness          1    0.5233  7.3994  0.001 ***
## bs                1    0.4219  5.9648  0.001 ***
## rugosity          1    0.5566  7.8692  0.001 ***
## sed_consistency   1    0.4051  5.7279  0.001 ***
## northness         1    0.3065  4.3341  0.001 ***
## depth             1    0.7140 10.0953  0.001 ***
## watermass         4    2.2169  7.8360  0.001 ***
## Residual        416   29.4227                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
coef(cca_full2) # coefficients show how much each environmental variable contributes to each canonical axis (CCA1, CCA2, etc.), which represent gradients in species composition. Larger absolute values indicate stronger relationships between the variable and the axis. Positive/negative values show the direction of the relationship.
##                           CCA1        CCA2        CCA3        CCA4        CCA5
## slope              0.003046283  0.07619436 -0.02444276  0.20700077 -0.45376598
## eastness          -0.034003582  0.01843548 -0.02245020  0.03469815 -0.15048615
## bs                 0.003579928  0.02156912 -0.01572393  0.05604582 -0.08394914
## rugosity          -0.004304628 -0.15690426  0.20063940 -0.16046266  0.52476327
## sed_consistency   -0.013532912 -0.05553178  0.10277548 -0.01944144 -0.31483791
## northness          0.018734916 -0.02479359  0.03650233  0.06076624  0.22421100
## depth             -0.040238619  0.92673831 -1.38652077 -0.91582236  1.15732423
## watermassANSC      2.427885399  1.42073726 -1.23473887 -3.21901654  2.91413419
## watermassPDW      -0.285408697  3.65497172  2.18743869 -0.89810151  0.24238639
## watermassPDW/LCDW -0.100495161 -0.96906124  0.84886983 -1.06846075 -2.39744927
## watermassPSIW      0.051714043  0.79122733 -0.28004008 -3.64077285  2.02298695
##                           CCA6       CCA7        CCA8       CCA9      CCA10
## slope             -0.823454227 -0.6521524  0.63005192 -0.2569045  0.2453663
## eastness           0.008120634  0.7333346  0.27019300  1.0848696 -0.1628281
## bs                -0.291936700  0.3461386  0.02578135  0.3410311  0.3320855
## rugosity           1.383543416  0.3993149  0.59968508  1.4918996 -1.4746398
## sed_consistency    0.213184176  0.7214802  0.61120849 -0.8641202  0.3237361
## northness          0.205112897 -0.3864530  0.40446483  0.1381029  0.9566287
## depth             -2.022439192 -0.8825660  0.05618539 -1.5459913  1.5089164
## watermassANSC      1.192352836 -0.7814902 -0.01502720  2.0775615 -1.4121719
## watermassPDW       2.574846966 -2.4408662 -0.04174918  1.1587748 -1.5393543
## watermassPDW/LCDW  5.023612682  0.3708254  0.21581445  3.0727194 -4.1527051
## watermassPSIW     -1.202364424 -2.6740830 -0.37907830 -0.6328968 -0.4726168
##                        CCA11
## slope             -0.3578369
## eastness          -1.1076067
## bs                 1.0414786
## rugosity           1.5662564
## sed_consistency   -0.1561345
## northness          0.3520567
## depth             -1.8398159
## watermassANSC      4.0886334
## watermassPDW       5.8650142
## watermassPDW/LCDW  7.4747254
## watermassPSIW      2.6916542
calculate_proportion_explained(cca_full2)  # The second full model, which additionally includes water mass (site is attached to env.z but not in this formula), explains much more (~16%) - which is to be expected. 
## [1] 0.1617468

Summary so far from the two full models: depth and water masses are the dominant drivers of community structure; rugosity and slope also play key roles, but to a lesser extent. Each water-mass level affects species composition differently, which is why the VIF output listed them separately. Next, I will run conditioned models to examine each variable's contribution separately.

# All variables to check for a conditioned model. For fun also including site here
env_vars <- c("depth", "site", "slope", "eastness", "rugosity", "bs", "sed_consistency", "feature", "watermass")

# Create a list to store CCA results
cca_results <- list()

# Run CCA for each variable as a condition: the focal variable is partialled out
# via Condition() while all remaining variables act as constraints.
# FIX: renamed the local 'formula' to 'cca_formula' to avoid shadowing base::formula().
for (var in env_vars) {
  cca_formula <- as.formula(paste("SPE.hel ~", paste(setdiff(env_vars, var), collapse = " + "), 
                                  "+ Condition(", var, ")"))
  cca_results[[var]] <- cca(cca_formula, data = env.z, scale = TRUE)
}
## 
## Some constraints or conditions were aliased because they were redundant. This
## can happen if terms are linearly dependent (collinear): 'feature2', 'feature3',
## 'feature4', 'feature5', 'watermassANSC', 'watermassPDW', 'watermassPDW/LCDW',
## 'watermassPSIW'
## 
## Some constraints or conditions were aliased because they were redundant. This
## can happen if terms are linearly dependent (collinear): 'feature2', 'feature3',
## 'feature4', 'feature5', 'watermassANSC', 'watermassPDW', 'watermassPDW/LCDW',
## 'watermassPSIW'
## 
## Some constraints or conditions were aliased because they were redundant. This
## can happen if terms are linearly dependent (collinear): 'feature2', 'feature3',
## 'feature4', 'feature5', 'watermassANSC', 'watermassPDW', 'watermassPDW/LCDW',
## 'watermassPSIW'
## 
## Some constraints or conditions were aliased because they were redundant. This
## can happen if terms are linearly dependent (collinear): 'feature2', 'feature3',
## 'feature4', 'feature5', 'watermassANSC', 'watermassPDW', 'watermassPDW/LCDW',
## 'watermassPSIW'
## 
## Some constraints or conditions were aliased because they were redundant. This
## can happen if terms are linearly dependent (collinear): 'feature2', 'feature3',
## 'feature4', 'feature5', 'watermassANSC', 'watermassPDW', 'watermassPDW/LCDW',
## 'watermassPSIW'
## 
## Some constraints or conditions were aliased because they were redundant. This
## can happen if terms are linearly dependent (collinear): 'feature2', 'feature3',
## 'feature4', 'feature5', 'watermassANSC', 'watermassPDW', 'watermassPDW/LCDW',
## 'watermassPSIW'
## 
## Some constraints or conditions were aliased because they were redundant. This
## can happen if terms are linearly dependent (collinear): 'feature2', 'feature3',
## 'feature4', 'feature5', 'watermassANSC', 'watermassPDW', 'watermassPDW/LCDW',
## 'watermassPSIW'
## 
## Some constraints or conditions were aliased because they were redundant. This
## can happen if terms are linearly dependent (collinear): 'site2306_01',
## 'site2306_07', 'site2306_14', 'site2306_18', 'watermassANSC', 'watermassPDW',
## 'watermassPDW/LCDW', 'watermassPSIW'
## 
## Some constraints or conditions were aliased because they were redundant. This
## can happen if terms are linearly dependent (collinear): 'site2304_08',
## 'site2306_04', 'site2306_06', 'site2306_18', 'feature2', 'feature3',
## 'feature4', 'feature5'
# Now extract and print the variance explained (inertia) for each model
# NOTE(review): $CCA$tot.chi is the CONSTRAINED inertia of the conditioned model
# (after partialling out 'var'), and $CCA$eig[1] is the first constrained axis only.
# So the printed percentage is CCA1's share of the constrained variation, not the
# total variation attributable to 'var' — confirm this is the intended metric.
for (var in env_vars) {
  inertia <- cca_results[[var]]$CCA$tot.chi  # constrained inertia of the conditioned model
  constrained_inertia <- cca_results[[var]]$CCA$eig[1]  # eigenvalue of the first constrained axis (CCA1)
  variation_explained <- (constrained_inertia / inertia) * 100
  cat("Variation explained by", var, ":", variation_explained, "%\n") # CCA1's percent of constrained inertia
}
## Variation explained by depth : 11.91178 %
## Variation explained by site : 30.31318 %
## Variation explained by slope : 11.44967 %
## Variation explained by eastness : 11.43797 %
## Variation explained by rugosity : 11.57756 %
## Variation explained by bs : 11.32152 %
## Variation explained by sed_consistency : 11.24673 %
## Variation explained by feature : 14.20353 %
## Variation explained by watermass : 16.49963 %

A bit of explanation to the above: The variation explained differs from the results from the previous ANOVA. This is because the CCA approach looks at the total variation explained by all the predictors and how each variable explains variation in the context of all others. When conditioning on one variable, the variation explained by that variable can appear smaller because some of the variation is explained by the other variables, and some is shared or redistributed among them. The ANOVA approach tests the marginal significance of each variable independently, so depth might show strong significance and low p-value because it independently explains a lot of variation in the species data. Basically:

  • ANOVA = Statistical Significance (which variables influence species composition significantly?)
  • PERCENT INERTIA = Explained Variation (relative importance of each environmental variable in explaining species composition)

Now we will run the model we want to plot:

# Remove all variables that are not added based on the results above. I do not include watermass, feature, site (because that doesn't make sense to include if I want to see the clustering, should not bias the ordination, was just used to test the effect. And watermass and feature are categorical, so each mass has a different influence). Also bs and eastness, northness because they have a very weak influence (see the coef results from above, first two CCA axes. <0.04 (or -0.04) in at least one of the axes. The other variables are above that, so I decided to only take those. When I run the model and plotted it including the above variables, they are very close to the center too, as expected.  

# Since this is the case, we could include Dive EX2306 08 again, since we don't end up using backscatter. I decided not to, as it doesn't change the outcome a lot, this dive is just missing from the CCA. I run this separately just to confirm, and the clustering and statistics are similar. Dive 08 clusters also with their respective water mass - PDW - and is close to the other PDW site, Quinn Seamount (EX2306 04), but also close to the other seamounts, as Quinn too. PDW and PSIW are a bit overlapping in both models. 

# Reduced model for the biplot: only the four retained predictors.
cca_model <- cca(SPE.hel ~ slope + rugosity + sed_consistency + depth, data = env.z, scale = TRUE)
calculate_proportion_explained(cca_model) 
## [1] 0.06735077
scores.cca <- vegan::scores(cca_model, display=c("sites","cn","bp","sp")) # sites, centroids, biplot, species
cca.sites <- data.frame(scores.cca$sites)
cca.biplot <- data.frame(scores.cca$biplot)
cca.species <- data.frame(scores.cca$species) # get all morphotypes
cca.species <- cca.species[which(abs(cca.species$CCA1) > 0.05 | abs(cca.species$CCA2) > 0.05),] # keep only morphotypes with a noticeable score on CCA1 or CCA2

# Attach site metadata to the site scores (rows assumed to align with data.b — confirm)
cca.sites$site <- data.b$Dive.Name # Add the site (dive) name
cca.sites$feature <- as.factor(data.b$feature) # Add the feature information
cca.sites$watermass <- as.factor(data.b$watermass) # Add the water mass information
cca.sites$sed_consistency <- as.factor(data.b$sed_consistency) # Add the sediment consistency information

# Convert row names (morphotype IDs) to an explicit column for merging
cca.species$Morphotype <- rownames(cca.species)
# Merge the species scores with the taxa metadata sheet
cca_full_df <- merge(cca.species, taxa, by = "Morphotype")

# Keep only species whose fit to the ordination is significant:
# Step 1: Run envfit to test significance of species with CCA axes
species_fit <- envfit(cca_model, SPE.hel, perm = 999)
# Step 2: Extract species with significant p-values
# NOTE(review): this uses p < 0.05, while the family-level section uses p < 0.01 —
# confirm the difference in thresholds is intentional.
significant_species <- species_fit$vectors$pvals < 0.05
significant_species_names <- rownames(species_fit$vectors$arrows)[significant_species]
# Step 3: Filter the merged table to the significant species only
cca_full_df <- cca_full_df[cca_full_df$Morphotype %in% significant_species_names, ]

# Plotting the biplot. I am using water mass as shape, since they were strongly significant. And also explained the highest variation in the model. 

# CCA biplot at morphotype level: sites coloured by dive and shaped by water mass,
# with constraint arrows (red), 95% water-mass ellipses, and significant morphotype labels.
plot.cca <- ggplot(cca.sites, aes(x = CCA1, y = CCA2)) +
  # FIX: alpha was mapped as a constant inside aes(), which routes it through
  # scale_alpha() and does NOT give alpha = 0.9; set it as a layer parameter instead.
  geom_point(aes(col = site, shape = watermass),
             size = 3, alpha = 0.9) +
  geom_hline(yintercept = 0, linetype = "dotted") +
  geom_vline(xintercept = 0, linetype = "dotted") +
  # Environmental constraint arrows, stretched x2 for readability
  geom_segment(data = cca.biplot,
               aes(x = 0, xend = CCA1*2, y = 0, yend = CCA2*2),
               arrow = arrow(length = unit(0.01, "npc")),
               color = "#C20000") +
  geom_text(data = cca.biplot,
            aes(x = CCA1*2.5, y = CCA2*2.5, label = rownames(cca.biplot)),
            size = 4,
            color = "#C20000") +
  # 95% ellipses per water mass. NOTE(review): stat_ellipse() draws paths by default,
  # so the fill aesthetic only groups/legends — use geom = "polygon" if shaded
  # ellipses are wanted.
  stat_ellipse(aes(fill = watermass), level = 0.95, alpha = 0.3, linetype = "dashed") +
  # Labels of significant morphotypes, jittered to reduce overlap
  geom_text(data = cca_full_df, aes(x = CCA1, y = CCA2, label = Morphotype),
            position = position_jitter(width = 0.5, height = 0.5), size = 3, color = "black") +
  labs(col = "Site", shape = "Water Mass or Current",
       x = "CCA1",
       y = "CCA2") + # FIX: dropped stray 'size = 5' (no size aesthetic is mapped)
  theme_linedraw() +
  theme(panel.grid = element_blank(),
        legend.title = element_text(size = 10),
        legend.text = element_text(size = 10),
        axis.title.x = element_text(size = 14),
        axis.title.y = element_text(size = 14),
        legend.background = element_blank(),
        legend.box.background = element_rect(colour = "black"),
        legend.position = "right")

plot.cca

9.2 Family-level analysis

Since the morphotype assignments might also influence the analysis (for example, sponges are often only broadly identified to genus or family, whereas corals are better resolved in that context), let's repeat the same analysis on the family-level data.

# First test the full model as above

# 1) Run a full model 
# This includes all variables that are possible to include (excluding site, lat, lon, watermass). Here it is very important that all NA's are removed, and rows are the same! To double check, use nrow on both df's

cca_full_family <- cca(SPE.hel3 ~ depth + slope + eastness + bs + rugosity + northness + sed_consistency, data = env.z, scale = TRUE)
# FIX: was anova(cca_full, ...), which re-tested the MORPHOTYPE model instead of the
# family model (the echoed output below still shows SPE.hel — re-knit to regenerate).
anova(cca_full_family, by = "margin") # test each variable's significance for the model
## Permutation test for cca under reduced model
## Marginal effects of terms
## Permutation: free
## Number of permutations: 999
## 
## Model: cca(formula = SPE.hel ~ depth + slope + eastness + bs + rugosity + northness + sed_consistency, data = env.z, scale = TRUE)
##                  Df ChiSquare      F Pr(>F)    
## depth             1     0.714 9.4782  0.001 ***
## slope             1     0.483 6.4055  0.001 ***
## eastness          1     0.415 5.5046  0.001 ***
## bs                1     0.300 3.9853  0.001 ***
## rugosity          1     0.536 7.1101  0.001 ***
## northness         1     0.291 3.8687  0.001 ***
## sed_consistency   1     0.335 4.4467  0.001 ***
## Residual        420    31.640                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(cca_full) 
## Permutation test for cca under reduced model
## Permutation: free
## Number of permutations: 999
## 
## Model: cca(formula = SPE.hel ~ depth + slope + eastness + bs + rugosity + northness + sed_consistency, data = env.z, scale = TRUE)
##           Df ChiSquare      F Pr(>F)    
## Model      7      3.46 6.5622  0.001 ***
## Residual 420     31.64                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Test the significance of individual canonical axes
anova(cca_full_family, by = "axis")
## Permutation test for cca under reduced model
## Forward tests for axes
## Permutation: free
## Number of permutations: 999
## 
## Model: cca(formula = SPE.hel3 ~ depth + slope + eastness + bs + rugosity + northness + sed_consistency, data = env.z, scale = TRUE)
##           Df ChiSquare       F Pr(>F)    
## CCA1       1    0.5048 28.8098  0.001 ***
## CCA2       1    0.3297 18.8177  0.001 ***
## CCA3       1    0.2477 14.1373  0.001 ***
## CCA4       1    0.1575  8.9872  0.001 ***
## CCA5       1    0.1023  5.8356  0.001 ***
## CCA6       1    0.0621  3.5426  0.001 ***
## CCA7       1    0.0395  2.2534  1.000    
## Residual 420    7.3598                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(cca_full_family)
## 
## Call:
## cca(formula = SPE.hel3 ~ depth + slope + eastness + bs + rugosity +      northness + sed_consistency, data = env.z, scale = TRUE) 
## 
## Partitioning of scaled Chi-square:
##               Inertia Proportion
## Total           8.803      1.000
## Constrained     1.444      0.164
## Unconstrained   7.360      0.836
## 
## Eigenvalues, and their contribution to the scaled Chi-square 
## 
## Importance of components:
##                          CCA1    CCA2    CCA3    CCA4    CCA5     CCA6     CCA7
## Eigenvalue            0.50484 0.32975 0.24773 0.15749 0.10226 0.062079 0.039486
## Proportion Explained  0.05735 0.03746 0.02814 0.01789 0.01162 0.007052 0.004485
## Cumulative Proportion 0.05735 0.09480 0.12294 0.14083 0.15245 0.159500 0.163985
##                           CA1     CA2     CA3     CA4     CA5     CA6    CA7
## Eigenvalue            0.71183 0.64385 0.61809 0.53605 0.46412 0.40354 0.3962
## Proportion Explained  0.08086 0.07314 0.07021 0.06089 0.05272 0.04584 0.0450
## Cumulative Proportion 0.24484 0.31798 0.38819 0.44908 0.50180 0.54764 0.5926
##                           CA8    CA9    CA10    CA11    CA12    CA13   CA14
## Eigenvalue            0.38396 0.3416 0.29900 0.28118 0.26221 0.23971 0.2227
## Proportion Explained  0.04361 0.0388 0.03396 0.03194 0.02978 0.02723 0.0253
## Cumulative Proportion 0.63626 0.6751 0.70902 0.74096 0.77075 0.79798 0.8233
##                          CA15    CA16    CA17   CA18    CA19    CA20    CA21
## Eigenvalue            0.19818 0.18780 0.16911 0.1514 0.14725 0.13287 0.12051
## Proportion Explained  0.02251 0.02133 0.01921 0.0172 0.01673 0.01509 0.01369
## Cumulative Proportion 0.84579 0.86712 0.88633 0.9035 0.92025 0.93535 0.94903
##                         CA22    CA23     CA24     CA25    CA26     CA27
## Eigenvalue            0.1118 0.09387 0.083024 0.062408 0.04305 0.032731
## Proportion Explained  0.0127 0.01066 0.009431 0.007089 0.00489 0.003718
## Cumulative Proportion 0.9617 0.97239 0.981823 0.988912 0.99380 0.997521
##                           CA28
## Eigenvalue            0.021826
## Proportion Explained  0.002479
## Cumulative Proportion 1.000000
## 
## Accumulated constrained eigenvalues
## Importance of components:
##                         CCA1   CCA2   CCA3   CCA4    CCA5    CCA6    CCA7
## Eigenvalue            0.5048 0.3297 0.2477 0.1575 0.10226 0.06208 0.03949
## Proportion Explained  0.3497 0.2284 0.1716 0.1091 0.07083 0.04300 0.02735
## Cumulative Proportion 0.3497 0.5781 0.7497 0.8588 0.92965 0.97265 1.00000
calculate_proportion_explained(cca_full_family) 
## [1] 0.1639855
vif.cca(cca_full_family) # checking again the collinearity, but should be same as before
##           depth           slope        eastness              bs        rugosity 
##        1.440203        1.184746        1.570254        1.302130        1.616738 
##       northness sed_consistency 
##        1.328061        1.306054
# Run a second full model, additionally using watermass as a constraint
# (just to test its influence)

# Re-attach site, watermass and feature as factors to env.z
# NOTE(review): env.z was already given these columns in the morphotype section
# above, and data.b was already filtered there — this re-assignment is redundant
# but harmless as long as row alignment still holds.
env.z$site <- as.factor(data.b$Dive.Name) # Add the site (dive) information
env.z$watermass <- as.factor(data.b$watermass) # Add the water mass information
env.z$feature <- as.factor(data.b$feature) # Add the feature information

cca_full2_family <- cca(SPE.hel3 ~ slope + eastness + bs + rugosity + sed_consistency + northness + depth + watermass, data = env.z, scale = TRUE)

anova(cca_full2_family, by = "terms") 
## Permutation test for cca under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
## 
## Model: cca(formula = SPE.hel3 ~ slope + eastness + bs + rugosity + sed_consistency + northness + depth + watermass, data = env.z, scale = TRUE)
##                  Df ChiSquare       F Pr(>F)    
## slope             1    0.2415 15.0588  0.001 ***
## eastness          1    0.2510 15.6515  0.001 ***
## bs                1    0.1410  8.7924  0.001 ***
## rugosity          1    0.1707 10.6436  0.001 ***
## sed_consistency   1    0.2428 15.1401  0.001 ***
## northness         1    0.0969  6.0442  0.001 ***
## depth             1    0.2998 18.6959  0.001 ***
## watermass         4    0.6890 10.7412  0.001 ***
## Residual        416    6.6708                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
coef(cca_full2) # coefficients show how much each environmental variable contributes to each canonical axis (CCA1, CCA2, etc.), which represent gradients in species composition. Larger absolute values indicate stronger relationships between the variable and the axis. Positive/negative values show the direction of the relationship.
##                           CCA1        CCA2        CCA3        CCA4        CCA5
## slope              0.003046283  0.07619436 -0.02444276  0.20700077 -0.45376598
## eastness          -0.034003582  0.01843548 -0.02245020  0.03469815 -0.15048615
## bs                 0.003579928  0.02156912 -0.01572393  0.05604582 -0.08394914
## rugosity          -0.004304628 -0.15690426  0.20063940 -0.16046266  0.52476327
## sed_consistency   -0.013532912 -0.05553178  0.10277548 -0.01944144 -0.31483791
## northness          0.018734916 -0.02479359  0.03650233  0.06076624  0.22421100
## depth             -0.040238619  0.92673831 -1.38652077 -0.91582236  1.15732423
## watermassANSC      2.427885399  1.42073726 -1.23473887 -3.21901654  2.91413419
## watermassPDW      -0.285408697  3.65497172  2.18743869 -0.89810151  0.24238639
## watermassPDW/LCDW -0.100495161 -0.96906124  0.84886983 -1.06846075 -2.39744927
## watermassPSIW      0.051714043  0.79122733 -0.28004008 -3.64077285  2.02298695
##                           CCA6       CCA7        CCA8       CCA9      CCA10
## slope             -0.823454227 -0.6521524  0.63005192 -0.2569045  0.2453663
## eastness           0.008120634  0.7333346  0.27019300  1.0848696 -0.1628281
## bs                -0.291936700  0.3461386  0.02578135  0.3410311  0.3320855
## rugosity           1.383543416  0.3993149  0.59968508  1.4918996 -1.4746398
## sed_consistency    0.213184176  0.7214802  0.61120849 -0.8641202  0.3237361
## northness          0.205112897 -0.3864530  0.40446483  0.1381029  0.9566287
## depth             -2.022439192 -0.8825660  0.05618539 -1.5459913  1.5089164
## watermassANSC      1.192352836 -0.7814902 -0.01502720  2.0775615 -1.4121719
## watermassPDW       2.574846966 -2.4408662 -0.04174918  1.1587748 -1.5393543
## watermassPDW/LCDW  5.023612682  0.3708254  0.21581445  3.0727194 -4.1527051
## watermassPSIW     -1.202364424 -2.6740830 -0.37907830 -0.6328968 -0.4726168
##                        CCA11
## slope             -0.3578369
## eastness          -1.1076067
## bs                 1.0414786
## rugosity           1.5662564
## sed_consistency   -0.1561345
## northness          0.3520567
## depth             -1.8398159
## watermassANSC      4.0886334
## watermassPDW       5.8650142
## watermassPDW/LCDW  7.4747254
## watermassPSIW      2.6916542
# See how much variation each variable explains
cca_results <- list()

# Run CCA for each variable as a condition (family-level response SPE.hel3).
# FIX: renamed the local 'formula' to 'cca_formula' to avoid shadowing base::formula().
for (var in env_vars) {
  cca_formula <- as.formula(paste("SPE.hel3 ~", paste(setdiff(env_vars, var), collapse = " + "), "+ Condition(", var, ")"))
  cca_results[[var]] <- cca(cca_formula, data = env.z, scale = TRUE)
}
## 
## Some constraints or conditions were aliased because they were redundant. This
## can happen if terms are linearly dependent (collinear): 'feature2', 'feature3',
## 'feature4', 'feature5', 'watermassANSC', 'watermassPDW', 'watermassPDW/LCDW',
## 'watermassPSIW'
## 
## Some constraints or conditions were aliased because they were redundant. This
## can happen if terms are linearly dependent (collinear): 'feature2', 'feature3',
## 'feature4', 'feature5', 'watermassANSC', 'watermassPDW', 'watermassPDW/LCDW',
## 'watermassPSIW'
## 
## Some constraints or conditions were aliased because they were redundant. This
## can happen if terms are linearly dependent (collinear): 'feature2', 'feature3',
## 'feature4', 'feature5', 'watermassANSC', 'watermassPDW', 'watermassPDW/LCDW',
## 'watermassPSIW'
## 
## Some constraints or conditions were aliased because they were redundant. This
## can happen if terms are linearly dependent (collinear): 'feature2', 'feature3',
## 'feature4', 'feature5', 'watermassANSC', 'watermassPDW', 'watermassPDW/LCDW',
## 'watermassPSIW'
## 
## Some constraints or conditions were aliased because they were redundant. This
## can happen if terms are linearly dependent (collinear): 'feature2', 'feature3',
## 'feature4', 'feature5', 'watermassANSC', 'watermassPDW', 'watermassPDW/LCDW',
## 'watermassPSIW'
## 
## Some constraints or conditions were aliased because they were redundant. This
## can happen if terms are linearly dependent (collinear): 'feature2', 'feature3',
## 'feature4', 'feature5', 'watermassANSC', 'watermassPDW', 'watermassPDW/LCDW',
## 'watermassPSIW'
## 
## Some constraints or conditions were aliased because they were redundant. This
## can happen if terms are linearly dependent (collinear): 'feature2', 'feature3',
## 'feature4', 'feature5', 'watermassANSC', 'watermassPDW', 'watermassPDW/LCDW',
## 'watermassPSIW'
## 
## Some constraints or conditions were aliased because they were redundant. This
## can happen if terms are linearly dependent (collinear): 'site2306_01',
## 'site2306_07', 'site2306_14', 'site2306_18', 'watermassANSC', 'watermassPDW',
## 'watermassPDW/LCDW', 'watermassPSIW'
## 
## Some constraints or conditions were aliased because they were redundant. This
## can happen if terms are linearly dependent (collinear): 'site2304_08',
## 'site2306_04', 'site2306_06', 'site2306_18', 'feature2', 'feature3',
## 'feature4', 'feature5'
# Now extract and print the variance explained (inertia) for each model
# NOTE(review): $CCA$tot.chi is the CONSTRAINED inertia of the conditioned model
# (after partialling out 'var'), and $CCA$eig[1] is the first constrained axis only.
# So the printed percentage is CCA1's share of the constrained variation, not the
# total variation attributable to 'var' — confirm this is the intended metric.
for (var in env_vars) {
  inertia <- cca_results[[var]]$CCA$tot.chi  # constrained inertia of the conditioned model
  constrained_inertia <- cca_results[[var]]$CCA$eig[1]  # eigenvalue of the first constrained axis (CCA1)
  variation_explained <- (constrained_inertia / inertia) * 100
  cat("Variation explained by", var, ":", variation_explained, "%\n") # CCA1's percent of constrained inertia
}
## Variation explained by depth : 18.99515 %
## Variation explained by site : 32.54604 %
## Variation explained by slope : 20.9424 %
## Variation explained by eastness : 18.50835 %
## Variation explained by rugosity : 18.82024 %
## Variation explained by bs : 20.3314 %
## Variation explained by sed_consistency : 15.76682 %
## Variation explained by feature : 20.29565 %
## Variation explained by watermass : 20.68799 %
# Reduced model for the biplot (family-level response, same four predictors as before)
cca_model_family <- cca(SPE.hel3 ~ slope + rugosity + sed_consistency + depth, data = env.z, scale = TRUE)
calculate_proportion_explained(cca_model_family) 
## [1] 0.1196027
scores.cca <- vegan::scores(cca_model_family, display=c("sites","cn","bp","sp")) # sites, centroids, biplot, species
cca.sites <- data.frame(scores.cca$sites)
cca.biplot <- data.frame(scores.cca$biplot)
cca.species <- data.frame(scores.cca$species) # get all families
cca.species <- cca.species[which(abs(cca.species$CCA1) > 0.05 | abs(cca.species$CCA2) > 0.05),] # keep only families with a noticeable score on CCA1 or CCA2

# Attach site metadata to the site scores (rows assumed to align with data.b — confirm)
cca.sites$site <- data.b$Dive.Name # Add the site (dive) name
cca.sites$feature <- as.factor(data.b$feature) # Add the feature information
cca.sites$watermass <- as.factor(data.b$watermass) # Add the water mass information
cca.sites$sed_consistency <- as.factor(data.b$sed_consistency) # Add the sediment consistency information

# Convert row names (family IDs) to an explicit column for merging
cca.species$Family <- rownames(cca.species)
# Merge the family scores with the taxa metadata sheet
cca_full_df <- merge(cca.species, taxa, by = "Family")

# Keep only families whose fit to the ordination is significant:
# Step 1: Run envfit to test significance of families with CCA axes
species_fit <- envfit(cca_model_family, SPE.hel3, perm = 999)
# Step 2: Extract families with significant p-values
# NOTE(review): this uses p < 0.01, while the morphotype section uses p < 0.05 —
# confirm the difference in thresholds is intentional.
significant_species <- species_fit$vectors$pvals < 0.01
significant_species_names <- rownames(species_fit$vectors$arrows)[significant_species]
# Step 3: Filter the merged table to the significant families only
cca_full_df <- cca_full_df[cca_full_df$Family %in% significant_species_names, ]

# Plotting the biplot. I am using water mass as shape, since they were strongly significant. And also explained the highest variation in the model. 

# CCA biplot at family level: same layout as the morphotype biplot, labelling
# significant families. NOTE(review): unlike the morphotype plot this one has no
# stat_ellipse() layer — confirm the omission of water-mass ellipses is intentional.
plot.cca_family <- ggplot(cca.sites, aes(x = CCA1, y = CCA2)) +
  # FIX: alpha was mapped as a constant inside aes(), which routes it through
  # scale_alpha() and does NOT give alpha = 0.9; set it as a layer parameter instead.
  geom_point(aes(col = site, shape = watermass),
             size = 3, alpha = 0.9) +
  geom_hline(yintercept = 0, linetype = "dotted") +
  geom_vline(xintercept = 0, linetype = "dotted") +
  # Environmental constraint arrows, stretched x2 for readability
  geom_segment(data = cca.biplot,
               aes(x = 0, xend = CCA1*2, y = 0, yend = CCA2*2),
               arrow = arrow(length = unit(0.01, "npc")),
               color = "#C20000") +
  geom_text(data = cca.biplot,
            aes(x = CCA1*2.5, y = CCA2*2.5, label = rownames(cca.biplot)),
            size = 4,
            color = "#C20000") +
  # Labels of significant families, jittered to reduce overlap
  geom_text(data = cca_full_df, aes(x = CCA1, y = CCA2, label = Family),
            position = position_jitter(width = 0.5, height = 0.5), size = 3, color = "black") +
  labs(col = "Site", shape = "Water Mass or Current",
       x = "CCA1",
       y = "CCA2") + # FIX: dropped stray 'size = 5' (no size aesthetic is mapped)
  theme_linedraw() +
  theme(panel.grid = element_blank(),
        legend.title = element_text(size = 10),
        legend.text = element_text(size = 10),
        axis.title.x = element_text(size = 14),
        axis.title.y = element_text(size = 14),
        legend.background = element_blank(),
        legend.box.background = element_rect(colour = "black"),
        legend.position = "right")

plot.cca_family

10 Optional: MAP

#This needs to be run in console because there is prompt asking for the download of the underlying data (map), so not included in Markdown knitted file

library(ggOceanMaps)
#packageVersion("ggOceanMaps")

# Aggregate to one row per dive: mean coordinates plus the dive's feature type.
# NOTE(review): uses data.c (dataset C — data.b aggregated by site, per the data
# description) — confirm it is built before this chunk, since this section is run
# manually in the console.
data.map <- data.c %>%
  group_by(Dive.Name) %>%
  dplyr::summarize(
    lat = mean(lat),  # Or use a different method to get unique coordinates
    lon = mean(lon),
    feature = first(feature)  # Keep the feature or other relevant data
  )

# Copy lon/lat into lon2/lat2 columns. (No actual reprojection happens here —
# the "WGS84 transform" is an identity mapping of the input coordinates.)
transform_coord <- function(coords) {
  data.frame(lon2 = coords$lon, lat2 = coords$lat)
}

# Add duplicate coordinate columns (lon2/lat2) for plotting.
# FIX: data.map is already aggregated to one row per Dive.Name by the
# group_by()/summarize() that created it, so re-grouping and re-summarizing here
# was redundant (means of single values); a plain mutate() yields the same table.
data_transformed <- data.map %>%
  mutate(lon2 = lon, lat2 = lat)

# Plotting the map
# NOTE: if asked a question in console, click YES
#basemap(limits = c(-175, -129, 51, 61), bathy.style = 'poly_grays', crs = 4326) + #bathymetry=TRUE, bathy.style = 'rcb'
#  geom_point(data = data_transformed, aes(x = lon2, y = lat2, color = factor(feature)), size = 3) +
#  geom_text(data = data_transformed, aes(x = lon2, y = lat2, label = Dive.Name), 
#            hjust = 0, vjust = 1.5, size = 3, color = "black") +
#  scale_color_viridis_d(name = "Feature") +
#  ggspatial::annotation_scale(location = "br") + 
#  ggspatial::annotation_north_arrow(location = "tr", which_north = "true") +
# theme_minimal() +
#  labs(title = "Dive Locations Map", x = "Longitude", y = "Latitude")

#Feature No: 1 canyon ; 2 ridge, 3 seamount, 4 seep, 5 slope